From 4b1550db598a8da3f44c1e070e141e1fa60a99e0 Mon Sep 17 00:00:00 2001 From: Frank McSherry Date: Thu, 19 Sep 2024 17:48:41 -0400 Subject: [PATCH 1/2] Demonstrate columnar stuff --- container/Cargo.toml | 2 + container/src/columnar.rs | 88 +++++++++++++++++++++++++++++++++ container/src/lib.rs | 1 + timely/examples/columnar.rs | 98 +++++++++++++++++++++++++++++++++++++ 4 files changed, 189 insertions(+) create mode 100644 container/src/columnar.rs create mode 100644 timely/examples/columnar.rs diff --git a/container/Cargo.toml b/container/Cargo.toml index 4d643425d..ae2060dbc 100644 --- a/container/Cargo.toml +++ b/container/Cargo.toml @@ -9,3 +9,5 @@ edition.workspace = true columnation = { git = "https://github.com/frankmcsherry/columnation" } flatcontainer = "0.5" serde = { version = "1.0", features = ["derive"] } +# columnar = { path = "../../columnar" } +columnar = { git = "https://github.com/frankmcsherry/columnar" } diff --git a/container/src/columnar.rs b/container/src/columnar.rs new file mode 100644 index 000000000..5ca03c1ed --- /dev/null +++ b/container/src/columnar.rs @@ -0,0 +1,88 @@ +//! Present a columnar container as a timely container. + +use serde::{Serialize, Deserialize}; + +pub use columnar::*; +use columnar::common::IterOwn; + +use crate::{Container, SizableContainer, PushInto}; + +/// A container based on a `columnar` store. 
+#[derive(Clone, Default, Serialize, Deserialize)] +pub struct Columnar { + store: C, +} + +impl Container for Columnar +where + for<'a> &'a C: columnar::Index, +{ + fn len(&self) -> usize { self.store.len() } + fn clear(&mut self) { self.store.clear() } + + type ItemRef<'a> = <&'a C as Index>::Ref where Self: 'a; + type Iter<'a> = IterOwn<&'a C>; + fn iter<'a>(&'a self) -> Self::Iter<'a> { (&self.store).into_iter() } + + type Item<'a> = <&'a C as Index>::Ref where Self: 'a; + type DrainIter<'a> = IterOwn<&'a C>; + fn drain<'a>(&'a mut self) -> Self::DrainIter<'a> { (&self.store).into_iter() } +} + +impl SizableContainer for Columnar +where + for<'a> &'a C: columnar::Index, +{ + fn capacity(&self) -> usize { 1024 } + fn preferred_capacity() -> usize { 1024 } + fn reserve(&mut self, _additional: usize) { } +} + +impl, T> PushInto for Columnar { + #[inline] + fn push_into(&mut self, item: T) { + self.store.push(item); + } +} + + +use columnar::bytes::{AsBytes, FromBytes, serialization::decode}; + +/// A container based on a columnar store, encoded in aligned bytes. +#[derive(Clone, Default)] +pub struct ColumnarBytes { + bytes: B, + phantom: std::marker::PhantomData, +} + +impl + Clone + Default + 'static, C: AsBytes + Clone + Default + 'static> Container for ColumnarBytes +where + for<'a> C::Borrowed<'a> : Len + Clear + Index, +{ + fn len(&self) -> usize { + as FromBytes>::from_bytes(&mut decode(&self.bytes)).len() + } + // Perhaps this should be an enum that allows the bytes to be un-set, but .. not sure what this should do. 
+ fn clear(&mut self) { unimplemented!() } + + type ItemRef<'a> = as Index>::Ref where Self: 'a; + type Iter<'a> = IterOwn>; + fn iter<'a>(&'a self) -> Self::Iter<'a> { + as FromBytes>::from_bytes(&mut decode(&self.bytes)).into_iter() + } + + type Item<'a> = as Index>::Ref where Self: 'a; + type DrainIter<'a> = IterOwn>; + fn drain<'a>(&'a mut self) -> Self::DrainIter<'a> { + as FromBytes>::from_bytes(&mut decode(&self.bytes)).into_iter() + } +} + +impl + Clone + Default + 'static, C: AsBytes + Clone + Default + 'static> SizableContainer for ColumnarBytes +where + for<'a> C::Borrowed<'a> : Len + Clear + Index, +{ + fn capacity(&self) -> usize { 1024 } + fn preferred_capacity() -> usize { 1024 } + fn reserve(&mut self, _additional: usize) { } +} diff --git a/container/src/lib.rs b/container/src/lib.rs index e22b2471a..dfb4c5847 100644 --- a/container/src/lib.rs +++ b/container/src/lib.rs @@ -6,6 +6,7 @@ use std::collections::VecDeque; pub mod columnation; pub mod flatcontainer; +pub mod columnar; /// A container transferring data through dataflow edges /// diff --git a/timely/examples/columnar.rs b/timely/examples/columnar.rs new file mode 100644 index 000000000..b3d01f851 --- /dev/null +++ b/timely/examples/columnar.rs @@ -0,0 +1,98 @@ +//! Wordcount based on columnar. + +use { + std::collections::HashMap, + timely::{Container, container::CapacityContainerBuilder}, + timely::container::columnar::Columnar, + timely::dataflow::channels::pact::{ExchangeCore, Pipeline}, + timely::dataflow::InputHandleCore, + timely::dataflow::operators::{Inspect, Operator, Probe}, + timely::dataflow::ProbeHandle, +}; + +fn main() { + + use timely_container::columnar::Strings; + type Container = Columnar<(Strings, Vec)>; + + // initializes and runs a timely dataflow. 
+ timely::execute_from_args(std::env::args(), |worker| { + let mut input = >>::new(); + let mut probe = ProbeHandle::new(); + + // create a new input, exchange data, and inspect its output + worker.dataflow::(|scope| { + input + .to_stream(scope) + .unary( + Pipeline, + "Split", + |_cap, _info| { + move |input, output| { + while let Some((time, data)) = input.next() { + let mut session = output.session(&time); + for (text, diff) in data.iter().flat_map(|(text, diff)| { + text.split_whitespace().map(move |s| (s, diff)) + }) { + session.give((text, diff)); + } + } + } + }, + ) + .container::() + .unary_frontier( + ExchangeCore::new(|(s, _): &(&str, _)| s.len() as u64), + "WordCount", + |_capability, _info| { + let mut queues = HashMap::new(); + let mut counts = HashMap::new(); + + move |input, output| { + while let Some((time, data)) = input.next() { + queues + .entry(time.retain()) + .or_insert(Vec::new()) + .push(data.take()); + } + + for (key, val) in queues.iter_mut() { + if !input.frontier().less_equal(key.time()) { + let mut session = output.session(key); + for batch in val.drain(..) { + for (word, diff) in batch.iter() { + let total = + if let Some(count) = counts.get_mut(word) { + *count += diff; + *count + } + else { + counts.insert(word.to_string(), *diff); + *diff + }; + session.give((word, total)); + } + } + } + } + + queues.retain(|_key, val| !val.is_empty()); + } + }, + ) + .container::() + .inspect(|x| println!("seen: {:?}", x)) + .probe_with(&mut probe); + }); + + // introduce data and watch! 
+ for round in 0..10 { + input.send(("flat container", 1)); + input.advance_to(round + 1); + while probe.less_than(input.time()) { + worker.step(); + } + } + }) + .unwrap(); +} From bb7ba797f9a213a03bbceb1a60d19b73ce030f23 Mon Sep 17 00:00:00 2001 From: Frank McSherry Date: Wed, 9 Oct 2024 12:42:27 -0400 Subject: [PATCH 2/2] Update to struct vs pair --- timely/Cargo.toml | 1 + timely/examples/columnar.rs | 35 ++++++++++++++++++++++------------- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/timely/Cargo.toml b/timely/Cargo.toml index 5deb5fed8..100b63325 100644 --- a/timely/Cargo.toml +++ b/timely/Cargo.toml @@ -32,3 +32,4 @@ smallvec = { version = "1.13.2", features = ["serde", "const_generics"] } [dev-dependencies] # timely_sort="0.1.6" rand = { version = "0.8", features = ["small_rng"] } +columnar = { git = "https://github.com/frankmcsherry/columnar" } diff --git a/timely/examples/columnar.rs b/timely/examples/columnar.rs index b3d01f851..c2882bd9d 100644 --- a/timely/examples/columnar.rs +++ b/timely/examples/columnar.rs @@ -10,10 +10,19 @@ use { timely::dataflow::ProbeHandle, }; +// Creates `WordCountContainer` and `WordCountReference` structs, +// as well as various implementations relating them to `WordCount`. +#[derive(Columnar)] +struct WordCount { + text: String, + diff: i64, +} + fn main() { - use timely_container::columnar::Strings; - type Container = Columnar<(Strings, Vec)>; + type Container = Columnar<::Container>; + + use columnar::Len; // initializes and runs a timely dataflow. 
timely::execute_from_args(std::env::args(), |worker| { @@ -31,10 +40,10 @@ fn main() { move |input, output| { while let Some((time, data)) = input.next() { let mut session = output.session(&time); - for (text, diff) in data.iter().flat_map(|(text, diff)| { - text.split_whitespace().map(move |s| (s, diff)) + for wordcount in data.iter().flat_map(|wordcount| { + wordcount.text.split_whitespace().map(move |text| WordCountReference { text, diff: wordcount.diff }) }) { - session.give((text, diff)); + session.give(wordcount); } } } @@ -42,7 +51,7 @@ fn main() { ) .container::() .unary_frontier( - ExchangeCore::new(|(s, _): &(&str, _)| s.len() as u64), + ExchangeCore::new(|x: &WordCountReference<&str,&i64>| x.text.len() as u64), "WordCount", |_capability, _info| { let mut queues = HashMap::new(); @@ -60,17 +69,17 @@ fn main() { if !input.frontier().less_equal(key.time()) { let mut session = output.session(key); for batch in val.drain(..) { - for (word, diff) in batch.iter() { + for wordcount in batch.iter() { let total = - if let Some(count) = counts.get_mut(word) { - *count += diff; + if let Some(count) = counts.get_mut(wordcount.text) { + *count += wordcount.diff; *count } else { - counts.insert(word.to_string(), *diff); - *diff + counts.insert(wordcount.text.to_string(), *wordcount.diff); + *wordcount.diff }; - session.give((word, total)); + session.give(WordCountReference { text: wordcount.text, diff: total }); } } } @@ -87,7 +96,7 @@ fn main() { // introduce data and watch! for round in 0..10 { - input.send(("flat container", 1)); + input.send(WordCountReference { text: "flat container", diff: 1 }); input.advance_to(round + 1); while probe.less_than(input.time()) { worker.step();