From 0daf37ca1355010c05809f7804abba4b37a702ef Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Tue, 21 May 2024 22:05:00 -0400 Subject: [PATCH] Generic chunker, plus flatcontainer support Signed-off-by: Moritz Hoffmann --- src/consolidation.rs | 275 ++++++++++++++- src/operators/arrange/arrangement.rs | 34 +- src/trace/implementations/chunker.rs | 325 ++++++++++++++++++ src/trace/implementations/merge_batcher.rs | 123 ++----- .../implementations/merge_batcher_col.rs | 87 +---- .../implementations/merge_batcher_flat.rs | 325 ++++++++++++++++++ src/trace/implementations/mod.rs | 69 +++- src/trace/implementations/ord_neu.rs | 42 ++- src/trace/implementations/rhh.rs | 5 +- 9 files changed, 1046 insertions(+), 239 deletions(-) create mode 100644 src/trace/implementations/chunker.rs create mode 100644 src/trace/implementations/merge_batcher_flat.rs diff --git a/src/consolidation.rs b/src/consolidation.rs index 1409da072..100159a47 100644 --- a/src/consolidation.rs +++ b/src/consolidation.rs @@ -10,10 +10,15 @@ //! you need specific behavior, it may be best to defensively copy, paste, and maintain the //! specific behavior you require. +use std::cmp::Ordering; use std::collections::VecDeque; +use timely::Container; use timely::container::{ContainerBuilder, PushInto, SizableContainer}; +use timely::container::flatcontainer::{FlatStack, Push, Region}; +use timely::container::flatcontainer::impls::tuple::{TupleABCRegion, TupleABRegion}; use crate::Data; -use crate::difference::Semigroup; +use crate::difference::{IsZero, Semigroup}; +use crate::trace::cursor::IntoOwned; /// Sorts and consolidates `vec`. /// @@ -210,6 +215,216 @@ where } } +/// Behavior to sort containers. +pub trait ContainerSorter { + /// Sort `container`, possibly replacing the contents by a different object. + fn sort(&mut self, target: &mut C); +} + +/// A generic container sorter for containers where the item implements [`ConsolidateLayout`]. +#[derive(Default)] +pub struct ExternalContainerSorter { + /// Storage to permute item. + permutation: Vec>, + /// Empty container to write results at. + empty: C, +} + +impl ContainerSorter for ExternalContainerSorter +where + for<'a> C: ConsolidateLayout + PushInto>, +{ + fn sort(&mut self, container: &mut C) { + // SAFETY: `Permutation` is empty, types are equal but have a different lifetime + let mut permutation: Vec> = unsafe { std::mem::transmute::>, Vec>>(std::mem::take(&mut self.permutation)) }; + + permutation.extend(container.drain()); + // permutation.sort_by_key(|item| C::key(item)); + permutation.sort_by(|a, b| C::cmp(a, b)); + + for item in permutation.drain(..) { + self.empty.push(item); + } + + // SAFETY: `Permutation` is empty, types are equal but have a different lifetime + self.permutation = unsafe { std::mem::transmute::>, Vec>>(permutation) }; + std::mem::swap(container, &mut self.empty); + self.empty.clear(); + } +} + +/// Sort containers in-place, with specific implementations. +#[derive(Default, Debug)] +pub struct InPlaceSorter(); + +impl ContainerSorter> for InPlaceSorter +where + T: Ord + Clone, + R: Clone, +{ + #[inline] + fn sort(&mut self, container: &mut Vec<(T, R)>) { + container.sort_by(|(a, _), (b, _)| a.cmp(b)); + } +} + +impl ContainerSorter> for InPlaceSorter +where + D: Ord + Clone, + T: Ord + Clone, + R: Clone, +{ + #[inline] + fn sort(&mut self, target: &mut Vec<(D, T, R)>) { + target.sort_by(|(d1, t1, _), (d2, t2, _)| (d1, t1).cmp(&(d2, t2))); + } +} + +/// Layout of data to be consolidated. 
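+///
+/// A minimal sketch of the intended semantics, mirroring the `Vec<(D, R)>`
+/// implementation below and using the `ContainerConsolidator` defined later
+/// in this file: sort the container by key, then fold runs of equal keys by
+/// accumulating their diffs, dropping keys whose accumulated diff is zero.
+///
+/// ```ignore
+/// let mut input = vec![(1, 1), (2, 1), (1, 1)];
+/// let mut output = Vec::new();
+/// input.sort();
+/// ContainerConsolidator::consolidate_container(&mut input, &mut output);
+/// assert_eq!(output, vec![(1, 2), (2, 1)]);
+/// ```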
+// TODO: This could be split in two, to separate sorting and consolidation. +pub trait ConsolidateLayout: Container { + /// Key portion of data, essentially everything minus the diff + type Key<'a>: Eq where Self: 'a; + + /// GAT diff type. + type Diff<'a>: IntoOwned<'a, Owned = Self::DiffOwned> where Self: 'a; + + /// Owned diff type. + type DiffOwned: for<'a> Semigroup>; + + /// Deconstruct an item into key and diff. + fn into_parts(item: Self::Item<'_>) -> (Self::Key<'_>, Self::Diff<'_>); + + /// Push an element to a compatible container. + fn push_with_diff(&mut self, key: Self::Key<'_>, diff: Self::DiffOwned); + + /// Compare two items by key to sort containers. + fn cmp<'a>(item1: &Self::Item<'_>, item2: &Self::Item<'_>) -> Ordering; +} + +impl ConsolidateLayout for Vec<(D, R)> +where + D: Ord + Clone + 'static, + for<'a> R: Semigroup + IntoOwned<'a, Owned = R> + Clone + 'static, +{ + type Key<'a> = D where Self: 'a; + type Diff<'a> = R where Self: 'a; + type DiffOwned = R; + + fn into_parts(item: Self::Item<'_>) -> (Self::Key<'_>, Self::Diff<'_>) { + item + } + + fn cmp<'a>(item1: &Self::Item<'_>, item2: &Self::Item<'_>) -> Ordering { + item1.0.cmp(&item2.0) + } + + fn push_with_diff(&mut self, key: Self::Key<'_>, diff: Self::DiffOwned) { + self.push((key, diff)); + } +} + +impl ConsolidateLayout for Vec<(D, T, R)> +where + D: Ord + Clone + 'static, + T: Ord + Clone + 'static, + for<'a> R: Semigroup + IntoOwned<'a, Owned = R> + Clone + 'static, +{ + type Key<'a> = (D, T) where Self: 'a; + type Diff<'a> = R where Self: 'a; + type DiffOwned = R; + + fn into_parts((data, time, diff): Self::Item<'_>) -> (Self::Key<'_>, Self::Diff<'_>) { + ((data, time), diff) + } + + fn cmp<'a>(item1: &Self::Item<'_>, item2: &Self::Item<'_>) -> Ordering { + (&item1.0, &item1.1).cmp(&(&item2.0, &item2.1)) + } + + fn push_with_diff(&mut self, (data, time): Self::Key<'_>, diff: Self::DiffOwned) { + self.push((data, time, diff)); + } +} + +impl ConsolidateLayout for FlatStack, T, R>> +where + for<'a> K: Region + Push<::ReadItem<'a>> + Clone + 'static, + for<'a> K::ReadItem<'a>: Ord + Copy, + for<'a> V: Region + Push<::ReadItem<'a>> + Clone + 'static, + for<'a> V::ReadItem<'a>: Ord + Copy, + for<'a> T: Region + Push<::ReadItem<'a>> + Clone + 'static, + for<'a> T::ReadItem<'a>: Ord + Copy, + R: Region + Push<::Owned> + Clone + 'static, + for<'a> R::Owned: Semigroup>, +{ + type Key<'a> = (K::ReadItem<'a>, V::ReadItem<'a>, T::ReadItem<'a>) where Self: 'a; + type Diff<'a> = R::ReadItem<'a> where Self: 'a; + type DiffOwned = R::Owned; + + fn into_parts(((key, val), time, diff): Self::Item<'_>) -> (Self::Key<'_>, Self::Diff<'_>) { + ((key, val, time), diff) + } + + fn cmp<'a>(((key1, val1), time1, _diff1): &Self::Item<'_>, ((key2, val2), time2, _diff2): &Self::Item<'_>) -> Ordering { + (K::reborrow(*key1), V::reborrow(*val1), T::reborrow(*time1)).cmp(&(K::reborrow(*key2), V::reborrow(*val2), T::reborrow(*time2))) + } + + fn push_with_diff(&mut self, (key, value, time): Self::Key<'_>, diff: Self::DiffOwned) { + self.copy(((key, value), time, diff)); + } +} + +/// Behavior for copying consolidation. +pub trait ConsolidateContainer { + /// Consolidate the contents of `container` and write the result to `target`. + fn consolidate_container(container: &mut C, target: &mut C); +} + +/// Container consolidator that requires the container's item to implement [`ConsolidateLayout`]. 
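+///
+/// The input must already be sorted by key, for example via a
+/// [`ContainerSorter`]; consolidation is then a single linear pass. A sketch
+/// matching the unit test below, where the two updates for key `1` cancel to
+/// zero and are dropped:
+///
+/// ```ignore
+/// let mut data = vec![(1, 1), (2, 1), (1, -1)];
+/// data.sort();
+/// let mut target = Vec::new();
+/// ContainerConsolidator::consolidate_container(&mut data, &mut target);
+/// assert_eq!(target, [(2, 1)]);
+/// ```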
+#[derive(Default, Debug)] +pub struct ContainerConsolidator; + +impl ConsolidateContainer for ContainerConsolidator +where + C: ConsolidateLayout, +{ + /// Consolidate the supplied container. + fn consolidate_container(container: &mut C, target: &mut C) { + let mut previous: Option<(C::Key<'_>, C::DiffOwned)> = None; + for item in container.drain() { + let (key, diff) = C::into_parts(item); + match &mut previous { + // Initial iteration. + None => previous = Some((key, diff.into_owned())), + Some((prevkey, d)) => { + // Second and following iteration. + if key == *prevkey { + d.plus_equals(&diff); + } else { + // Keys don't match, write down result if non-zero. + if !d.is_zero() { + // Unwrap because we checked for `Some` above. + let (prevkey, diff) = previous.take().unwrap(); + target.push_with_diff(prevkey, diff); + } + // Update `previous` + previous = Some((key, diff.into_owned())); + } + } + } + } + // Write any residual data. + if let Some((previtem, d)) = previous { + if !d.is_zero() { + target.push_with_diff(previtem, d); + } + } + } +} + + + #[cfg(test)] mod tests { use super::*; @@ -316,6 +531,62 @@ mod tests { for i in 0..1024 { assert_eq!((i, 0, 2), collected[i]); } - + } + + #[test] + fn test_consolidate_container() { + let mut data = vec![(1,1), (2, 1), (1, -1)]; + let mut target = Vec::default(); + data.sort(); + ContainerConsolidator::consolidate_container(&mut data, &mut target); + assert_eq!(target, [(2,1)]); + } + + #[cfg(not(debug_assertions))] + const LEN: usize = 256 << 10; + #[cfg(not(debug_assertions))] + const REPS: usize = 10 << 10; + + #[cfg(debug_assertions)] + const LEN: usize = 256 << 1; + #[cfg(debug_assertions)] + const REPS: usize = 10 << 1; + + #[test] + fn test_consolidator_duration() { + let mut data = Vec::with_capacity(LEN); + let mut data2 = Vec::with_capacity(LEN); + let mut target = Vec::new(); + let mut duration = std::time::Duration::default(); + for _ in 0..REPS { + data.clear(); + data2.clear(); + target.clear(); + data.extend((0..LEN).map(|i| (i/4, -2isize + ((i % 4) as isize)))); + data2.extend((0..LEN).map(|i| (i/4, -2isize + ((i % 4) as isize)))); + data.sort_by(|x,y| x.0.cmp(&y.0)); + let start = std::time::Instant::now(); + ContainerConsolidator::consolidate_container(&mut data, &mut target); + duration += start.elapsed(); + + consolidate(&mut data2); + assert_eq!(target, data2); + } + println!("elapsed consolidator {duration:?}"); + } + + #[test] + fn test_consolidator_duration_vec() { + let mut data = Vec::with_capacity(LEN); + let mut duration = std::time::Duration::default(); + for _ in 0..REPS { + data.clear(); + data.extend((0..LEN).map(|i| (i/4, -2isize + ((i % 4) as isize)))); + data.sort_by(|x,y| x.0.cmp(&y.0)); + let start = std::time::Instant::now(); + consolidate(&mut data); + duration += start.elapsed(); + } + println!("elapsed vec {duration:?}"); } } diff --git a/src/operators/arrange/arrangement.rs b/src/operators/arrange/arrangement.rs index 370d085a6..ef14f2b92 100644 --- a/src/operators/arrange/arrangement.rs +++ b/src/operators/arrange/arrangement.rs @@ -369,15 +369,6 @@ where Tr::Batch: Batch, Tr::Batcher: Batcher, ; - - /// Arranges updates into a shared trace, using a supplied parallelization contract, with a supplied name. 
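+    // Migration sketch for a hypothetical external caller: `arrange_core` is
+    // no longer a trait method and is instead the free function invoked in the
+    // impls below, so `collection.arrange_core(pact, name)` becomes
+    // `arrange_core(&collection.inner, pact, name)`.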
- fn arrange_core(&self, pact: P, name: &str) -> Arranged> - where - P: ParallelizationContract, - Tr: Trace+'static, - Tr::Batch: Batch, - Tr::Batcher: Batcher, - ; } impl Arrange> for Collection @@ -395,17 +386,7 @@ where Tr::Batcher: Batcher>, { let exchange = Exchange::new(move |update: &((K,V),G::Timestamp,R)| (update.0).0.hashed().into()); - self.arrange_core(exchange, name) - } - - fn arrange_core(&self, pact: P, name: &str) -> Arranged> - where - P: ParallelizationContract>, - Tr: Trace+'static, - Tr::Batch: Batch, - Tr::Batcher: Batcher>, - { - arrange_core(&self.inner, pact, name) + arrange_core(&self.inner, exchange, name) } } @@ -583,18 +564,7 @@ where Tr::Batcher: Batcher>, { let exchange = Exchange::new(move |update: &((K,()),G::Timestamp,R)| (update.0).0.hashed().into()); - self.arrange_core(exchange, name) - } - - fn arrange_core(&self, pact: P, name: &str) -> Arranged> - where - P: ParallelizationContract>, - Tr: Trace+'static, - Tr::Batch: Batch, - Tr::Batcher: Batcher>, - { - self.map(|k| (k, ())) - .arrange_core(pact, name) + arrange_core(&self.map(|k| (k, ())).inner, exchange, name) } } diff --git a/src/trace/implementations/chunker.rs b/src/trace/implementations/chunker.rs new file mode 100644 index 000000000..55ac911a5 --- /dev/null +++ b/src/trace/implementations/chunker.rs @@ -0,0 +1,325 @@ +//! Organize streams of data into sorted chunks. + +use std::marker::PhantomData; +use timely::communication::message::RefOrMut; +use timely::Container; +use timely::container::columnation::{Columnation, TimelyStack}; +use timely::container::{PushInto, SizableContainer}; +use crate::consolidation::{consolidate_updates, ConsolidateContainer, ContainerSorter}; +use crate::difference::Semigroup; + +/// Behavior to transform streams of data into sorted chunks of regular size. +pub trait Chunker { + /// Input container type. + type Input; + /// Output container type. + type Output; + + /// Accept a container and absorb its contents. The caller must + /// call [`extract`] or [`finish`] soon after pushing a container. + fn push_container(&mut self, container: RefOrMut); + + /// Extract all read data, leaving unfinished data behind. + fn extract(&mut self) -> Option; + + /// Unconditionally extract all data, leaving no unfinished data behind. + fn finish(&mut self) -> Option; +} + +/// Chunk a stream of vectors into chains of vectors. +pub struct VecChunker { + pending: Vec, + ready: Vec>, +} + +impl Default for VecChunker { + fn default() -> Self { + Self { + pending: Vec::default(), + ready: Vec::default(), + } + } +} + +impl VecChunker { + const BUFFER_SIZE_BYTES: usize = 8 << 10; + fn chunk_capacity(&self) -> usize { + let size = ::std::mem::size_of::(); + if size == 0 { + Self::BUFFER_SIZE_BYTES + } else if size <= Self::BUFFER_SIZE_BYTES { + Self::BUFFER_SIZE_BYTES / size + } else { + 1 + } + } + + fn pending_capacity(&self) -> usize { + self.chunk_capacity() * 2 + } +} + +impl Chunker for VecChunker<((K, V), T, R)> +where + K: Ord + Clone, + V: Ord + Clone, + T: Ord + Clone, + R: Semigroup + Clone, +{ + type Input = Vec<((K, V), T, R)>; + type Output = Self::Input; + + fn push_container(&mut self, container: RefOrMut) { + // Ensure `self.pending` has the desired capacity. We should never have a larger capacity + // because we don't write more than capacity elements into the buffer. + if self.pending.capacity() < self.pending_capacity() { + self.pending.reserve(self.pending_capacity() - self.pending.len()); + } + + // Form chunks from what's in pending. 
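+        // (Sizing sketch: `BUFFER_SIZE_BYTES` is 8 << 10 = 8192 bytes, so for
+        // a 32-byte update such as `((u64, u64), u64, i64)` the chunk capacity
+        // is 8192 / 32 = 256 elements, and `pending` buffers twice that, 512,
+        // before each consolidation pass.)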
+ // This closure does the following: + // * If pending is full, consolidate. + // * If after consolidation it's more than half full, peel off chunks, + // leaving behind any partial chunk in pending. + let form_chunk = |this: &mut Self| { + if this.pending.len() == this.pending.capacity() { + consolidate_updates(&mut this.pending); + if this.pending.len() >= this.chunk_capacity() { + while this.pending.len() > this.chunk_capacity() { + let mut chunk = Vec::with_capacity(this.chunk_capacity()); + chunk.extend(this.pending.drain(..chunk.capacity())); + this.ready.push(chunk); + } + } + } + }; + + // `container` is either a shared reference or an owned allocations. + match container { + RefOrMut::Ref(vec) => { + let mut slice = &vec[..]; + while !slice.is_empty() { + let (head, tail) = slice.split_at(std::cmp::min(self.pending.capacity() - self.pending.len(), slice.len())); + slice = tail; + self.pending.extend_from_slice(head); + form_chunk(self); + } + } + RefOrMut::Mut(vec) => { + let mut drain = vec.drain(..).peekable(); + while drain.peek().is_some() { + self.pending.extend((&mut drain).take(self.pending.capacity() - self.pending.len())); + form_chunk(self); + } + } + } + } + + fn extract(&mut self) -> Option { + self.ready.pop() + } + + fn finish(&mut self) -> Option { + if !self.pending.is_empty() { + consolidate_updates(&mut self.pending); + while !self.pending.is_empty() { + let mut chunk = Vec::with_capacity(self.chunk_capacity()); + chunk.extend(self.pending.drain(..std::cmp::min(self.pending.len(), chunk.capacity()))); + self.ready.push(chunk); + } + } + self.ready.pop() + } +} + +/// Chunk a stream of vectors into chains of vectors. +pub struct ColumnationChunker { + pending: Vec, + ready: Vec>, +} + +impl Default for ColumnationChunker { + fn default() -> Self { + Self { + pending: Vec::default(), + ready: Vec::default(), + } + } +} + +impl ColumnationChunker +where + T: Columnation, +{ + const BUFFER_SIZE_BYTES: usize = 64 << 10; + fn chunk_capacity(&self) -> usize { + let size = ::std::mem::size_of::(); + if size == 0 { + Self::BUFFER_SIZE_BYTES + } else if size <= Self::BUFFER_SIZE_BYTES { + Self::BUFFER_SIZE_BYTES / size + } else { + 1 + } + } + + /// Buffer size for pending updates, currently 2 * [`Self::chunk_capacity`]. + fn pending_capacity(&self) -> usize { + self.chunk_capacity() * 2 + } +} + +impl Chunker for ColumnationChunker<((K, V), T, R)> +where + K: Columnation + Ord + Clone, + V: Columnation + Ord + Clone, + T: Columnation + Ord + Clone, + R: Columnation + Semigroup + Clone, +{ + type Input = Vec<((K, V), T, R)>; + type Output = TimelyStack<((K,V),T,R)>; + + fn push_container(&mut self, container: RefOrMut) { + // Ensure `self.pending` has the desired capacity. We should never have a larger capacity + // because we don't write more than capacity elements into the buffer. + if self.pending.capacity() < self.pending_capacity() { + self.pending.reserve(self.pending_capacity() - self.pending.len()); + } + + // Form chunks from what's in pending. + // This closure does the following: + // * If pending is full, consolidate. + // * If after consolidation it's more than half full, peel off chunks, + // leaving behind any partial chunk in pending. 
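+        // (Worked example with hypothetical numbers: with a chunk capacity of
+        // 1024 and 1536 updates surviving consolidation, one chunk of 1024 is
+        // peeled off and 512 remain in `pending`; exactly 1024 survivors would
+        // stay in `pending` until more input arrives or `finish` is called.)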
+ let form_chunk = |this: &mut Self| { + if this.pending.len() == this.pending.capacity() { + consolidate_updates(&mut this.pending); + if this.pending.len() >= this.chunk_capacity() { + while this.pending.len() > this.chunk_capacity() { + let mut chunk = TimelyStack::with_capacity(this.chunk_capacity()); + for item in this.pending.drain(..chunk.capacity()) { + chunk.copy(&item); + } + this.ready.push(chunk); + } + } + } + }; + + // `container` is either a shared reference or an owned allocations. + match container { + RefOrMut::Ref(vec) => { + let mut slice = &vec[..]; + while !slice.is_empty() { + let (head, tail) = slice.split_at(std::cmp::min(self.pending.capacity() - self.pending.len(), slice.len())); + slice = tail; + self.pending.extend_from_slice(head); + form_chunk(self); + } + } + RefOrMut::Mut(vec) => { + let mut drain = vec.drain(..).peekable(); + while drain.peek().is_some() { + self.pending.extend((&mut drain).take(self.pending.capacity() - self.pending.len())); + form_chunk(self); + } + } + } + } + + fn extract(&mut self) -> Option { + self.ready.pop() + } + + fn finish(&mut self) -> Option { + consolidate_updates(&mut self.pending); + while !self.pending.is_empty() { + let mut chunk = TimelyStack::with_capacity(self.chunk_capacity()); + for item in self.pending.drain(..std::cmp::min(self.pending.len(), chunk.capacity())) { + chunk.copy(&item); + } + self.ready.push(chunk); + } + self.ready.pop() + } +} + +/// Chunk a stream of vectors into chains of vectors. +pub struct ContainerChunker +where + I: Container, + for<'a> O: SizableContainer + PushInto>, + Sorter: ContainerSorter, + Consolidator: ConsolidateContainer + ?Sized, +{ + pending: O, + empty: O, + ready: Vec, + sorter: Sorter, + _marker: PhantomData<(I, Consolidator)>, +} + +impl Default for ContainerChunker +where + I: Container, + for<'a> O: SizableContainer + PushInto>, + Sorter: ContainerSorter + Default, + Consolidator: ConsolidateContainer + ?Sized, +{ + fn default() -> Self { + Self { + pending: O::default(), + empty: O::default(), + ready: Vec::default(), + sorter: Sorter::default(), + _marker: PhantomData, + } + } +} + +impl Chunker for ContainerChunker +where + I: Container, + for<'a> O: SizableContainer + PushInto>, + Sorter: ContainerSorter, + Consolidator: ConsolidateContainer, +{ + type Input = I; + type Output = O; + + fn push_container(&mut self, container: RefOrMut) { + if self.pending.capacity() < O::preferred_capacity() { + self.pending.reserve(O::preferred_capacity() - self.pending.len()); + } + // TODO: This uses `IterRef`, which isn't optimal for containers that can move. 
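+        // (Flow sketch: items accumulate in `pending`; once it is full we
+        // sort, consolidate into `empty`, and swap the two. The consolidated
+        // `pending` is emitted as a ready chunk only if it remains more than
+        // half full; otherwise it keeps absorbing input so that emitted
+        // chunks stay reasonably full.)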
+ for item in container.iter() { + self.pending.push(item); + if self.pending.len() == self.pending.capacity() { + self.sorter.sort(&mut self.pending); + Consolidator::consolidate_container(&mut self.pending, &mut self.empty); + std::mem::swap(&mut self.pending, &mut self.empty); + self.empty.clear(); + if self.pending.len() > self.pending.capacity() / 2 { + self.ready.push(std::mem::take(&mut self.pending)); + } + } + } + } + + fn extract(&mut self) -> Option { + self.ready.pop() + } + + fn finish(&mut self) -> Option { + if !self.pending.is_empty() { + self.sorter.sort(&mut self.pending); + Consolidator::consolidate_container(&mut self.pending, &mut self.empty); + std::mem::swap(&mut self.pending, &mut self.empty); + self.empty.clear(); + if !self.pending.is_empty() { + self.ready.push(std::mem::take(&mut self.pending)); + } + } + self.ready.pop() + } +} diff --git a/src/trace/implementations/merge_batcher.rs b/src/trace/implementations/merge_batcher.rs index bb13cf650..8e4983aed 100644 --- a/src/trace/implementations/merge_batcher.rs +++ b/src/trace/implementations/merge_batcher.rs @@ -1,6 +1,7 @@ //! A general purpose `Batcher` implementation based on radix sort. use std::collections::VecDeque; +use std::marker::PhantomData; use timely::communication::message::RefOrMut; use timely::logging::WorkerIdentifier; @@ -9,16 +10,17 @@ use timely::progress::frontier::AntichainRef; use timely::progress::{frontier::Antichain, Timestamp}; use timely::{Container, PartialOrder}; -use crate::consolidation::consolidate_updates; use crate::difference::Semigroup; use crate::logging::{BatcherEvent, DifferentialEvent}; use crate::trace::{Batcher, Builder}; use crate::Data; +use crate::trace::implementations::chunker::Chunker; /// Creates batches from unordered tuples. -pub struct MergeBatcher +pub struct MergeBatcher where - M: Merger, + C: Chunker + Default, + M: Merger
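+// A hedged end-to-end sketch of the `Chunker` contract above (hypothetical
+// driver code; the update type is chosen for illustration): push a container,
+// drain full chunks with `extract`, then flush the remainder with `finish`.
+//
+//     let mut chunker = VecChunker::<((u64, u64), u64, i64)>::default();
+//     let mut updates = vec![((0, 0), 0, 1), ((0, 0), 0, 1), ((1, 2), 0, 1)];
+//     chunker.push_container(RefOrMut::Mut(&mut updates));
+//     let mut chunks = Vec::new();
+//     while let Some(chunk) = chunker.extract() { chunks.push(chunk); }
+//     if let Some(chunk) = chunker.finish() { chunks.push(chunk); }
+//     // One sorted, consolidated chunk: [((0, 0), 0, 2), ((1, 2), 0, 1)].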