From fa16bfd397499a4e7091a8d390bc55d9ad8e9c0c Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 5 Aug 2020 12:33:19 +0300 Subject: [PATCH 01/57] feat: unixfs tree builder --- unixfs/src/dir.rs | 2 + unixfs/src/dir/builder.rs | 764 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 766 insertions(+) create mode 100644 unixfs/src/dir/builder.rs diff --git a/unixfs/src/dir.rs b/unixfs/src/dir.rs index 07af0ea99..218106ca4 100644 --- a/unixfs/src/dir.rs +++ b/unixfs/src/dir.rs @@ -10,6 +10,8 @@ pub use sharded_lookup::{Cache, LookupError, ShardError, ShardedLookup}; mod directory; pub(crate) use directory::{check_directory_supported, UnexpectedDirectoryProperties}; +pub mod builder; + pub(crate) fn check_hamtshard_supported( mut flat: FlatUnixFs<'_>, ) -> Result, ShardError> { diff --git a/unixfs/src/dir/builder.rs b/unixfs/src/dir/builder.rs new file mode 100644 index 000000000..3251fb608 --- /dev/null +++ b/unixfs/src/dir/builder.rs @@ -0,0 +1,764 @@ +use cid::Cid; +use std::collections::hash_map::Entry::*; +use std::collections::{BTreeMap, HashMap}; +use std::fmt::{self, Write}; + +enum Entry { + Leaf(Leaf), + Directory(DirBuilder), +} + +impl fmt::Debug for Entry { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + use Entry::*; + + match self { + Leaf(leaf) => write!(fmt, "Leaf {{ {:?} }}", leaf), + Directory(_) => write!(fmt, "DirBuilder {{ .. 
}}"), + } + } +} + +impl Entry { + fn as_dir_builder(&mut self) -> Result<&mut DirBuilder, ()> { + use Entry::*; + match self { + Directory(ref mut d) => Ok(d), + _ => Err(()), + } + } +} + +struct Leaf { + link: Cid, + total_size: u64, +} + +impl fmt::Debug for Leaf { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(fmt, "{}, {}", self.link, self.total_size) + } +} + +#[derive(Default, Debug)] +pub struct TreeOptions { + wrap_in_directory: bool, +} + +impl TreeOptions { + /// When true, allow multiple top level entries, otherwise error on the second entry + pub fn with_wrap_in_directory(mut self) -> TreeOptions { + self.wrap_in_directory = true; + self + } +} + +#[derive(Debug)] +pub enum TreeBuildingFailed { + RootedPath(String), + RepeatSlashesInPath(String), + TooManyRootLevelEntries, + DuplicatePath(String), + LeafAsDirectory(String), +} + +impl fmt::Display for TreeBuildingFailed { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + use TreeBuildingFailed::*; + + match self { + RootedPath(s) => write!(fmt, "path is rooted: {:?}", s), + RepeatSlashesInPath(s) => write!(fmt, "path contains repeat slashes: {:?}", s), + TooManyRootLevelEntries => write!( + fmt, + "multiple root level entries while configured wrap_in_directory = false" + ), + // TODO: perhaps we should allow adding two leafs with same Cid? + DuplicatePath(s) => write!(fmt, "path exists already: {:?}", s), + LeafAsDirectory(s) => write!( + fmt, + "attempted to use already added leaf as a subdirectory: {:?}", + s + ), + } + } +} + +impl std::error::Error for TreeBuildingFailed {} + +#[derive(Debug)] +pub struct BufferingTreeBuilder { + /// At the root there can be only one element, unless an option was given to create a new + /// directory surrounding the root elements. 
+ root_builder: DirBuilder, + longest_path: usize, + // used to generate each node an unique id which is used when doing the post order traversal to + // recover all childrens rendered Cids + counter: u64, + opts: TreeOptions, +} + +impl Default for BufferingTreeBuilder { + fn default() -> Self { + Self::new(TreeOptions::default()) + } +} + +impl BufferingTreeBuilder { + pub fn new(opts: TreeOptions) -> Self { + BufferingTreeBuilder { + root_builder: DirBuilder::root(0), + longest_path: 0, + counter: 1, + opts, + } + } + + // metadata has no bearing here + pub fn put_file( + &mut self, + full_path: &str, + target: Cid, + total_size: u64, + ) -> Result<(), TreeBuildingFailed> { + // inserted at the depth + let leaf = Leaf { + link: target, + total_size, + }; + + self.modify_with(full_path, |parent, basename, _| { + parent + .put_leaf(basename, leaf) + .map_err(|_| TreeBuildingFailed::DuplicatePath(full_path.to_string())) + }) + } + + /// Directories get "put" implicitly through the put files, and directories need to be adjusted + /// only when wanting them to have metadata. + pub fn set_metadata( + &mut self, + full_path: &str, + metadata: Metadata, + ) -> Result<(), TreeBuildingFailed> { + // create all paths along the way + // + // set if not set, error otherwise? FIXME: doesn't error atm + self.modify_with(full_path, |parent, basename, id| { + parent + .add_or_get_node(basename, id) + .map_err(|_| TreeBuildingFailed::LeafAsDirectory(full_path.to_string()))? 
+ .set_metadata(metadata); + Ok(()) + }) + } + + fn modify_with(&mut self, full_path: &str, f: F) -> Result<(), TreeBuildingFailed> + where + F: FnOnce(&mut DirBuilder, String, &mut Option) -> Result<(), TreeBuildingFailed>, + { + // create all paths along the way + // + // assuming it's ok to split '/' since that cannot be escaped in linux at least + + self.longest_path = full_path.len().max(self.longest_path); + let mut remaining = full_path.split('/').enumerate().peekable(); + let mut dir_builder = &mut self.root_builder; + + // needed to avoid borrowing into the DirBuilder::new calling closure + let counter = &mut self.counter; + + while let Some((depth, next)) = remaining.next() { + let last = remaining.peek().is_none(); + + match (depth, next, last) { + // this might need to be accepted in case there is just a single file + (0, "", true) => { /* accepted */ } + (0, "", false) => { + return Err(TreeBuildingFailed::RootedPath(full_path.to_string())) + } + (_, "", false) => { + return Err(TreeBuildingFailed::RepeatSlashesInPath( + full_path.to_string(), + )) + } + (_, "", true) => todo!("path ends in slash"), + _ => {} + } + + // our first level can be full given the options + let full = depth == 0 && !self.opts.wrap_in_directory && dir_builder.is_empty(); + + if last { + let mut next_id = Some(*counter); + + let ret = if full { + Err(TreeBuildingFailed::TooManyRootLevelEntries) + } else { + f(dir_builder, next.to_string(), &mut next_id) + }; + + if next_id.is_none() { + *counter += 1; + } + + if ret.is_err() { + // FIXME: there might be a case where we have now stale nodes in our tree but + // cannot figure out an example for that. 
+ } + + return ret; + } + + let parent_id = dir_builder.id; + + dir_builder = match (full, dir_builder.nodes.entry(next.to_string())) { + (_, Occupied(oe)) => oe + .into_mut() + .as_dir_builder() + .map_err(|_| TreeBuildingFailed::LeafAsDirectory(full_path.to_string()))?, + (false, Vacant(ve)) => { + let next_id = *counter; + *counter += 1; + ve.insert(Entry::Directory(DirBuilder::new(parent_id, next_id))) + .as_dir_builder() + .expect("safe: we just inserted a DirBuilder") + } + (true, Vacant(_)) => return Err(TreeBuildingFailed::TooManyRootLevelEntries), + }; + } + + // as the str::split will always return a single element this should not ever be hit + unreachable!( + "walked the full_path but failed to add anything: {:?}", + full_path + ); + } + + /// Called to build the tree. The built tree will have the added files and their implied + /// directory structure, along with the any directory entries which were created using + /// `set_metadata`. To build the whole hierarchy, one must iterate the returned iterator to + /// completion while storing the created blocks. + /// + /// Returned `PostOrderIterator` will use the given `full_path` and `block_buffer` to store + /// it's data during the walk. `PostOrderIterator` implements `Iterator` while also allowing + /// borrowed access via `next_borrowed`. 
+ fn build<'a>( + self, + full_path: &'a mut String, + block_buffer: &'a mut Vec, + ) -> PostOrderIterator<'a> { + full_path.clear(); + block_buffer.clear(); + + PostOrderIterator { + full_path, + old_depth: 0, + block_buffer, + pending: vec![Visited::Descent { + node: self.root_builder, + name: None, + depth: 0, + }], + persisted_cids: Default::default(), + reused_children: Vec::new(), + cid: None, + wrap_in_directory: self.opts.wrap_in_directory, + } + } +} + +#[derive(Debug)] +enum Visited { + Descent { + node: DirBuilder, + name: Option, + depth: usize, + }, + Post { + parent_id: Option, + id: u64, + name: Option, + depth: usize, + leaves: Vec<(String, Leaf)>, + }, +} + +fn update_full_path( + (full_path, old_depth): (&mut String, &mut usize), + name: Option<&str>, + depth: usize, +) { + if depth < 2 { + // initially thought it might be good idea to add slash to all components; removing it made + // it impossible to get back down to empty string, so fixing this for depths 0 and 1. + full_path.clear(); + *old_depth = 0; + } else { + while *old_depth >= depth && *old_depth > 0 { + // we now want to pop the last segment + // this would be easier with pathbuf + let slash_at = full_path.bytes().rposition(|ch| ch == b'/'); + if let Some(slash_at) = slash_at { + full_path.truncate(slash_at); + *old_depth -= 1; + } else { + todo!( + "no last slash_at in {:?} yet {} >= {}", + full_path, + old_depth, + depth + ); + } + } + } + + debug_assert!(*old_depth <= depth); + + if let Some(name) = name { + if !full_path.is_empty() { + full_path.push_str("/"); + } + full_path.push_str(name); + *old_depth += 1; + } + + assert_eq!(*old_depth, depth); +} + +pub struct PostOrderIterator<'a> { + full_path: &'a mut String, + old_depth: usize, + block_buffer: &'a mut Vec, + // our stack of pending work + pending: Vec, + // "communication channel" from nested entries back to their parents + persisted_cids: HashMap, BTreeMap>, + reused_children: Vec, + cid: Option, + // from TreeOptions + 
wrap_in_directory: bool, +} + +fn identity_cid(number: usize) -> Cid { + use multihash::{Multihash, Sha2_256}; + + let mh = Sha2_256::digest(&number.to_le_bytes()); + Cid::new_v0(mh).unwrap() +} + +#[derive(Debug)] +pub enum TreeConstructionFailed { + // TODO: at least any quick_protobuf errors here? +} + +impl fmt::Display for TreeConstructionFailed { + fn fmt(&self, _fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + todo!() + } +} + +impl std::error::Error for TreeConstructionFailed {} + +impl<'a> PostOrderIterator<'a> { + fn render_directory( + links: &BTreeMap, + _buffer: &mut Vec, + ) -> Result { + Ok(Leaf { + link: identity_cid(links.len()), + total_size: 42, + }) + } + + fn next_borrowed<'b>( + &'b mut self, + ) -> Option> { + while let Some(visited) = self.pending.pop() { + let (name, depth) = match &visited { + Visited::Descent { name, depth, .. } => (name.as_deref(), *depth), + Visited::Post { name, depth, .. } => (name.as_deref(), *depth), + }; + + update_full_path((self.full_path, &mut self.old_depth), name, depth); + + match visited { + Visited::Descent { node, name, depth } => { + let mut leaves = Vec::new(); + + let children = &mut self.reused_children; + + for (k, v) in node.nodes { + match v { + Entry::Directory(node) => children.push(Visited::Descent { + node, + name: Some(k), + depth: depth + 1, + }), + Entry::Leaf(leaf) => leaves.push((k, leaf)), + } + } + + self.pending.push(Visited::Post { + parent_id: node.parent_id, + id: node.id, + name, + depth, + leaves, + }); + + let any_children = !children.is_empty(); + + self.pending.extend(children.drain(..)); + + if any_children { + // we could strive to do everything right now but pushing and popping might + // turn out easier code wise, or in other words, when there are no child_nodes + // we wouldn't need to go through Visited::Post. + } + } + Visited::Post { + parent_id, + id, + name, + leaves, + .. 
+ } => { + // all of our children have now been visited; we should be able to find their + // Cids in the btreemap + let mut collected = self.persisted_cids.remove(&Some(id)).unwrap_or_default(); + + // FIXME: leaves could be drained and reused + collected.extend(leaves); + + if !self.wrap_in_directory && parent_id.is_none() { + // we aren't supposed to wrap_in_directory, and we are now looking at the + // possibly to be generated root directory. + + assert_eq!( + collected.len(), + 1, + "should not have gone this far with multiple added roots" + ); + + return None; + } + + // render unixfs, maybe return it? + let buffer = &mut self.block_buffer; + buffer.clear(); + + let leaf = match Self::render_directory(&collected, buffer) { + Ok(leaf) => leaf, + Err(e) => return Some(Err(e)), + }; + + self.cid = Some(leaf.link.clone()); + + // this reuse strategy is probably good enough + collected.clear(); + + if let Some(name) = name { + // name is none only for the wrap_in_directory, which cannot really be + // propagated up but still the parent_id is allowed to be None + let previous = self + .persisted_cids + .entry(parent_id) + .or_insert(collected) + .insert(name, leaf); + + assert!(previous.is_none()); + } + + if parent_id.is_none() { + // rewrite the full_path for the wrap_in_directory + assert!( + self.full_path.is_empty(), + "full_path should had been empty but it was not: {:?}", + self.full_path + ); + // at the wrap_in_directory level the name should be the root level Cid + write!(self.full_path, "{}", self.cid.as_ref().unwrap()).unwrap(); + self.old_depth += 1; + } + + return Some(Ok(( + self.full_path.as_str(), + self.cid.as_ref().unwrap(), + &self.block_buffer, + ))); + } + } + } + None + } +} + +impl<'a> Iterator for PostOrderIterator<'a> { + type Item = Result<(String, Cid, Box<[u8]>), TreeConstructionFailed>; + + fn next(&mut self) -> Option { + self.next_borrowed().map(|res| { + res.map(|(full_path, cid, block)| (full_path.to_string(), cid.to_owned(), 
block.into())) + }) + } +} + +struct DuplicateName; +struct FoundLeaf; + +/// Node in a directory tree. +#[derive(Debug)] +struct DirBuilder { + /// Immediate files, symlinks or directories in this directory + nodes: HashMap, + /// Metadata for this directory + metadata: Metadata, + /// Id of the parent; None for the root node + parent_id: Option, + /// Internal id, used for propagating Cids back from children during post order visit. + id: u64, +} + +impl DirBuilder { + fn new(parent_id: u64, id: u64) -> Self { + assert_ne!(parent_id, id); + DirBuilder { + nodes: HashMap::new(), + metadata: Default::default(), + parent_id: Some(parent_id), + id, + } + } + + fn root(id: u64) -> Self { + DirBuilder { + nodes: HashMap::new(), + metadata: Default::default(), + parent_id: None, + id, + } + } + + fn put_leaf(&mut self, key: String, leaf: Leaf) -> Result<(), DuplicateName> { + match self.nodes.entry(key) { + Occupied(_) => Err(DuplicateName), + Vacant(ve) => { + ve.insert(Entry::Leaf(leaf)); + Ok(()) + } + } + } + + fn add_or_get_node( + &mut self, + key: String, + id: &mut Option, + ) -> Result<&mut DirBuilder, FoundLeaf> { + match self.nodes.entry(key) { + Occupied(oe) => oe.into_mut().as_dir_builder().map_err(|_| FoundLeaf), + Vacant(ve) => { + let id = id.take().unwrap(); + let entry = ve.insert(Entry::Directory(Self::new(self.id, id))); + Ok(entry.as_dir_builder().expect("just inserted")) + } + } + } + + fn len(&self) -> usize { + self.nodes.len() + } + + fn is_empty(&self) -> bool { + self.len() != 0 + } + + fn set_metadata(&mut self, metadata: Metadata) { + self.metadata = metadata; + } +} + +#[derive(Default, Debug)] +pub struct Metadata { + mtime: Option<(i64, u32)>, + mode: Option, +} + +#[cfg(test)] +mod tests { + use super::{identity_cid, BufferingTreeBuilder, Metadata, TreeBuildingFailed, TreeOptions}; + use cid::Cid; + use std::convert::TryFrom; + + #[test] + fn some_directories() { + let mut builder = BufferingTreeBuilder::default(); + + // foobar\n + let 
five_block_foobar = + Cid::try_from("QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6").unwrap(); + + builder + .put_file("a/b/c/d/e/f/g.txt", five_block_foobar.clone(), 1) + .unwrap(); + builder + .put_file("a/b/c/d/e/h.txt", five_block_foobar.clone(), 2) + .unwrap(); + builder + .put_file("a/b/c/d/e/i.txt", five_block_foobar.clone(), 3) + .unwrap(); + + let mut full_path = String::new(); + let mut buffer = Vec::new(); + + let iter = builder.build(&mut full_path, &mut buffer); + let mut actual = iter + .map(|res| res.map(|(p, cid, _)| (p, cid))) + .collect::, _>>() + .unwrap(); + + let mut expected = vec![ + ( + "a/b/c/d/e/f", + "Qmbgf44ztW9wLcGNRNYGinGQB6SQDQtbHVbkM5MrWms698", + ), + ( + "a/b/c/d/e", + "Qma1hCr3CuPRAq2Gw4DCNMqsi42Bjs4Bt1MGSS57kNh144", + ), + ("a/b/c/d", "QmUqaYatcJqiSFdykHXGh4Nog1eMSfDJBeYzcG67KV5Ri4"), + ("a/b/c", "QmYwaNBaGpDCNN9XpHmjxVPHmEXZMw9KDY3uikE2UU5fVB"), + ("a/b", "QmeAzCPig4o4gBLh2LvP96Sr8MUBrsu2Scw9MTq1EvTDhY"), + ("a", "QmSTUFaPwJW8xD4KNRLLQRqVTYtYC29xuhYTJoYPWdzvKp"), + ]; + + // hopefully this way the errors will be easier to hunt down + + actual.reverse(); + expected.reverse(); + + while let Some(actual) = actual.pop() { + let expected = expected.pop().expect("size mismatch"); + assert_eq!(actual.0, expected.0); + assert_eq!(actual.1.to_string(), expected.1, "{:?}", actual.0); + } + } + + #[test] + fn empty_path() { + let mut builder = BufferingTreeBuilder::default(); + builder.put_file("", identity_cid(0), 1).unwrap(); + + let mut full_path = String::new(); + let mut buffer = Vec::new(); + + let iter = builder.build(&mut full_path, &mut buffer); + let actual = iter + .map(|res| res.map(|(p, _, _)| p)) + .collect::, _>>() + .unwrap(); + + assert!( + actual.is_empty(), + "wrapping in directory was not asked, single element" + ); + } + + #[test] + #[should_panic] + fn rooted_path() { + let mut builder = BufferingTreeBuilder::default(); + builder.put_file("/a", identity_cid(0), 1).unwrap(); + } + + #[test] + #[should_panic] + fn 
successive_slashes() { + let mut builder = BufferingTreeBuilder::default(); + builder.put_file("a//b", identity_cid(0), 1).unwrap(); + } + + #[test] + fn multiple_roots() { + let opts = TreeOptions::default().with_wrap_in_directory(); + let mut builder = BufferingTreeBuilder::new(opts); + builder.put_file("a", identity_cid(0), 1).unwrap(); + builder.put_file("b", identity_cid(1), 1).unwrap(); + + let mut full_path = String::new(); + let mut buffer = Vec::new(); + + let iter = builder.build(&mut full_path, &mut buffer); + let actual = iter + .map(|res| res.map(|(p, _, _)| p)) + .collect::, _>>() + .unwrap(); + + // FIXME: how to test that this element has two links? + assert_eq!(actual, &[identity_cid(2).to_string()]); + } + + #[test] + #[should_panic] + fn denied_multiple_root_dirs() { + let mut builder = BufferingTreeBuilder::default(); + builder.put_file("a/c.txt", identity_cid(0), 1).unwrap(); + builder.put_file("b/d.txt", identity_cid(1), 1).unwrap(); + } + + #[test] + #[should_panic] + fn denied_multiple_root_files() { + let mut builder = BufferingTreeBuilder::default(); + builder.put_file("a.txt", identity_cid(0), 1).unwrap(); + builder.put_file("b.txt", identity_cid(1), 1).unwrap(); + } + + #[test] + #[should_panic] + fn using_leaf_as_node() { + let mut builder = BufferingTreeBuilder::default(); + builder.put_file("a.txt", identity_cid(0), 1).unwrap(); + builder.put_file("a.txt/b.txt", identity_cid(1), 1).unwrap(); + } + + #[test] + fn set_metadata_before_files() { + let mut builder = BufferingTreeBuilder::default(); + builder + .set_metadata("a/b/c/d", Metadata::default()) + .unwrap(); + builder + .put_file("a/b/c/d/e.txt", identity_cid(1), 1) + .unwrap(); + builder + .put_file("a/b/c/d/f.txt", identity_cid(2), 1) + .unwrap(); + + let mut full_path = String::new(); + let mut buffer = Vec::new(); + + let iter = builder.build(&mut full_path, &mut buffer); + let actual = iter + .map(|res| res.map(|(p, _, _)| p)) + .collect::, _>>() + .unwrap(); + + 
assert_eq!(actual, &["a/b/c/d", "a/b/c", "a/b", "a",]) + } + + #[test] + fn set_metadata_on_file() { + let mut builder = BufferingTreeBuilder::default(); + builder.put_file("a/a.txt", identity_cid(0), 1).unwrap(); + let err = builder + .set_metadata("a/a.txt", Metadata::default()) + .unwrap_err(); + + assert!( + matches!(err, TreeBuildingFailed::LeafAsDirectory(_)), + "{:?}", + err + ); + } +} From de9247574d1774b088b2dacff51324de4e51dfdd Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 5 Aug 2020 13:26:52 +0300 Subject: [PATCH 02/57] feat: generate actual dag-pb --- unixfs/src/dir/builder.rs | 142 ++++++++++++++++++++++++++++---------- 1 file changed, 104 insertions(+), 38 deletions(-) diff --git a/unixfs/src/dir/builder.rs b/unixfs/src/dir/builder.rs index 3251fb608..a1d20251e 100644 --- a/unixfs/src/dir/builder.rs +++ b/unixfs/src/dir/builder.rs @@ -340,13 +340,6 @@ pub struct PostOrderIterator<'a> { wrap_in_directory: bool, } -fn identity_cid(number: usize) -> Cid { - use multihash::{Multihash, Sha2_256}; - - let mh = Sha2_256::digest(&number.to_le_bytes()); - Cid::new_v0(mh).unwrap() -} - #[derive(Debug)] pub enum TreeConstructionFailed { // TODO: at least any quick_protobuf errors here? @@ -362,12 +355,59 @@ impl std::error::Error for TreeConstructionFailed {} impl<'a> PostOrderIterator<'a> { fn render_directory( - links: &BTreeMap, - _buffer: &mut Vec, + links: &mut BTreeMap, + buffer: &mut Vec, ) -> Result { + use crate::pb::{FlatUnixFs, PBLink, UnixFs, UnixFsType}; + use quick_protobuf::{BytesWriter, MessageWrite, Writer}; + use sha2::{Digest, Sha256}; + use std::borrow::Cow; + + // TODO: this could quite easily be made so that the links are read from the btreemap for + // calculating the size and rendering + let mut combined_from_links = 0; + + let flat = FlatUnixFs { + links: links + .iter() // .drain() would be the most reasonable + .inspect(|(_, Leaf { total_size, .. 
})| combined_from_links += total_size) + .map(|(name, Leaf { link, total_size })| PBLink { + Hash: Some(link.to_bytes().into()), + Name: Some(Cow::Borrowed(name.as_str())), + Tsize: Some(*total_size), + }) + .collect::>(), + data: UnixFs { + Type: UnixFsType::Directory, + Data: None, + ..Default::default() + }, + }; + + let size = flat.get_size(); + + // FIXME: we shouldn't be creating too large structures (bitswap block size limit!) + // FIXME: changing this to autosharding is going to take some thinking + + buffer.clear(); + let cap = buffer.capacity(); + + if let Some(additional) = size.checked_sub(cap) { + buffer.reserve(additional); + } + + // argh + buffer.extend(std::iter::repeat(0).take(size)); + + let mut writer = Writer::new(BytesWriter::new(&mut buffer[..])); + flat.write_message(&mut writer) + .expect("unsure how this could fail"); + let mh = multihash::wrap(multihash::Code::Sha2_256, &Sha256::digest(&buffer)); + let cid = Cid::new_v0(mh).expect("sha2_256 is the correct multihash for cidv0"); + Ok(Leaf { - link: identity_cid(links.len()), - total_size: 42, + link: cid, + total_size: buffer.len() as u64 + combined_from_links, }) } @@ -448,7 +488,7 @@ impl<'a> PostOrderIterator<'a> { let buffer = &mut self.block_buffer; buffer.clear(); - let leaf = match Self::render_directory(&collected, buffer) { + let leaf = match Self::render_directory(&mut collected, buffer) { Ok(leaf) => leaf, Err(e) => return Some(Err(e)), }; @@ -586,7 +626,7 @@ pub struct Metadata { #[cfg(test)] mod tests { - use super::{identity_cid, BufferingTreeBuilder, Metadata, TreeBuildingFailed, TreeOptions}; + use super::{BufferingTreeBuilder, Metadata, TreeBuildingFailed, TreeOptions}; use cid::Cid; use std::convert::TryFrom; @@ -599,13 +639,13 @@ mod tests { Cid::try_from("QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6").unwrap(); builder - .put_file("a/b/c/d/e/f/g.txt", five_block_foobar.clone(), 1) + .put_file("a/b/c/d/e/f/g.txt", five_block_foobar.clone(), 221) .unwrap(); builder - 
.put_file("a/b/c/d/e/h.txt", five_block_foobar.clone(), 2) + .put_file("a/b/c/d/e/h.txt", five_block_foobar.clone(), 221) .unwrap(); builder - .put_file("a/b/c/d/e/i.txt", five_block_foobar.clone(), 3) + .put_file("a/b/c/d/e/i.txt", five_block_foobar.clone(), 221) .unwrap(); let mut full_path = String::new(); @@ -613,7 +653,7 @@ mod tests { let iter = builder.build(&mut full_path, &mut buffer); let mut actual = iter - .map(|res| res.map(|(p, cid, _)| (p, cid))) + .map(|res| res.map(|(p, cid, buf)| (p, cid, buf))) .collect::, _>>() .unwrap(); @@ -640,14 +680,32 @@ mod tests { while let Some(actual) = actual.pop() { let expected = expected.pop().expect("size mismatch"); assert_eq!(actual.0, expected.0); - assert_eq!(actual.1.to_string(), expected.1, "{:?}", actual.0); + assert_eq!( + actual.1.to_string(), + expected.1, + "{:?}: {:?}", + actual.0, + Hex(&actual.2) + ); + } + } + + struct Hex<'a>(&'a [u8]); + use std::fmt; + + impl<'a> fmt::Debug for Hex<'a> { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + for b in self.0 { + write!(fmt, "{:02x}", b)?; + } + Ok(()) } } #[test] fn empty_path() { let mut builder = BufferingTreeBuilder::default(); - builder.put_file("", identity_cid(0), 1).unwrap(); + builder.put_file("", some_cid(0), 1).unwrap(); let mut full_path = String::new(); let mut buffer = Vec::new(); @@ -668,22 +726,28 @@ mod tests { #[should_panic] fn rooted_path() { let mut builder = BufferingTreeBuilder::default(); - builder.put_file("/a", identity_cid(0), 1).unwrap(); + builder.put_file("/a", some_cid(0), 1).unwrap(); } #[test] #[should_panic] fn successive_slashes() { let mut builder = BufferingTreeBuilder::default(); - builder.put_file("a//b", identity_cid(0), 1).unwrap(); + builder.put_file("a//b", some_cid(0), 1).unwrap(); } #[test] fn multiple_roots() { + // foobar\n + let five_block_foobar = + Cid::try_from("QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6").unwrap(); + let opts = TreeOptions::default().with_wrap_in_directory(); let mut 
builder = BufferingTreeBuilder::new(opts); - builder.put_file("a", identity_cid(0), 1).unwrap(); - builder.put_file("b", identity_cid(1), 1).unwrap(); + builder + .put_file("a", five_block_foobar.clone(), 221) + .unwrap(); + builder.put_file("b", five_block_foobar, 221).unwrap(); let mut full_path = String::new(); let mut buffer = Vec::new(); @@ -694,32 +758,31 @@ mod tests { .collect::, _>>() .unwrap(); - // FIXME: how to test that this element has two links? - assert_eq!(actual, &[identity_cid(2).to_string()]); + assert_eq!(actual, &["QmdbWuhpVCX9weVMMqvVTMeGwKMqCNJDbx7ZK1zG36sea7"]); } #[test] #[should_panic] fn denied_multiple_root_dirs() { let mut builder = BufferingTreeBuilder::default(); - builder.put_file("a/c.txt", identity_cid(0), 1).unwrap(); - builder.put_file("b/d.txt", identity_cid(1), 1).unwrap(); + builder.put_file("a/c.txt", some_cid(0), 1).unwrap(); + builder.put_file("b/d.txt", some_cid(1), 1).unwrap(); } #[test] #[should_panic] fn denied_multiple_root_files() { let mut builder = BufferingTreeBuilder::default(); - builder.put_file("a.txt", identity_cid(0), 1).unwrap(); - builder.put_file("b.txt", identity_cid(1), 1).unwrap(); + builder.put_file("a.txt", some_cid(0), 1).unwrap(); + builder.put_file("b.txt", some_cid(1), 1).unwrap(); } #[test] #[should_panic] fn using_leaf_as_node() { let mut builder = BufferingTreeBuilder::default(); - builder.put_file("a.txt", identity_cid(0), 1).unwrap(); - builder.put_file("a.txt/b.txt", identity_cid(1), 1).unwrap(); + builder.put_file("a.txt", some_cid(0), 1).unwrap(); + builder.put_file("a.txt/b.txt", some_cid(1), 1).unwrap(); } #[test] @@ -728,12 +791,8 @@ mod tests { builder .set_metadata("a/b/c/d", Metadata::default()) .unwrap(); - builder - .put_file("a/b/c/d/e.txt", identity_cid(1), 1) - .unwrap(); - builder - .put_file("a/b/c/d/f.txt", identity_cid(2), 1) - .unwrap(); + builder.put_file("a/b/c/d/e.txt", some_cid(1), 1).unwrap(); + builder.put_file("a/b/c/d/f.txt", some_cid(2), 1).unwrap(); let mut 
full_path = String::new(); let mut buffer = Vec::new(); @@ -750,7 +809,7 @@ mod tests { #[test] fn set_metadata_on_file() { let mut builder = BufferingTreeBuilder::default(); - builder.put_file("a/a.txt", identity_cid(0), 1).unwrap(); + builder.put_file("a/a.txt", some_cid(0), 1).unwrap(); let err = builder .set_metadata("a/a.txt", Metadata::default()) .unwrap_err(); @@ -761,4 +820,11 @@ mod tests { err ); } + + /// Returns a quick and dirty sha2-256 of the given number as a Cidv0 + fn some_cid(number: usize) -> Cid { + use multihash::Sha2_256; + let mh = Sha2_256::digest(&number.to_le_bytes()); + Cid::new_v0(mh).unwrap() + } } From 31e2a5ee5a84e270db029e47cb522c1b7adfca69 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 5 Aug 2020 13:30:26 +0300 Subject: [PATCH 03/57] fix: remove unneeded mut in render_directory --- unixfs/src/dir/builder.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unixfs/src/dir/builder.rs b/unixfs/src/dir/builder.rs index a1d20251e..243dad2b2 100644 --- a/unixfs/src/dir/builder.rs +++ b/unixfs/src/dir/builder.rs @@ -355,7 +355,7 @@ impl std::error::Error for TreeConstructionFailed {} impl<'a> PostOrderIterator<'a> { fn render_directory( - links: &mut BTreeMap, + links: &BTreeMap, buffer: &mut Vec, ) -> Result { use crate::pb::{FlatUnixFs, PBLink, UnixFs, UnixFsType}; @@ -488,7 +488,7 @@ impl<'a> PostOrderIterator<'a> { let buffer = &mut self.block_buffer; buffer.clear(); - let leaf = match Self::render_directory(&mut collected, buffer) { + let leaf = match Self::render_directory(&collected, buffer) { Ok(leaf) => leaf, Err(e) => return Some(Err(e)), }; From c59a4cbe221b973836cc8241e916a4710d71ee84 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 5 Aug 2020 17:10:27 +0300 Subject: [PATCH 04/57] refactor: try make /add streaming --- http/src/v0/root_files/add.rs | 137 ++++++++++++++++++++++------------ 1 file changed, 91 insertions(+), 46 deletions(-) diff --git a/http/src/v0/root_files/add.rs 
b/http/src/v0/root_files/add.rs index 402873ffe..b3e96e4fe 100644 --- a/http/src/v0/root_files/add.rs +++ b/http/src/v0/root_files/add.rs @@ -18,7 +18,7 @@ pub(super) async fn add_inner( body: impl Stream> + Unpin, ) -> Result { // FIXME: this should be without adder at least - use ipfs::unixfs::ll::file::adder::FileAdder; + use ipfs::unixfs::ll::{dir::builder::BufferingTreeBuilder, file::adder::FileAdder}; let boundary = content_type .get_param("boundary") @@ -28,9 +28,12 @@ pub(super) async fn add_inner( let mut stream = MultipartStream::new(Bytes::from(boundary), body.map_ok(|mut buf| buf.to_bytes())); + // TODO: wrap-in-directory option + let mut tree = BufferingTreeBuilder::default(); + // this should be a while loop but clippy will warn if this is a while loop which will only get // executed once. - if let Some(mut field) = stream + while let Some(mut field) = stream .try_next() .await .map_err(|e| StringError::from(format!("IO error: {}", e)))? @@ -40,6 +43,7 @@ pub(super) async fn add_inner( .map_err(|e| StringError::from(format!("unparseable headers: {}", e)))?; if field_name != "file" { + // this seems constant for files and directories return Err(StringError::from(format!("unsupported field: {}", field_name)).into()); } @@ -48,54 +52,95 @@ pub(super) async fn add_inner( .map_err(|e| StringError::from(format!("unparseable filename: {}", e)))? 
.to_string(); - let mut adder = FileAdder::default(); - let mut total = 0u64; - - loop { - let next = field - .try_next() - .await - .map_err(|e| StringError::from(format!("IO error: {}", e)))?; - - match next { - Some(next) => { - let mut read = 0usize; - while read < next.len() { - let (iter, used) = adder.push(&next.slice(read..)); - read += used; - - let maybe_tuple = import_all(&ipfs, iter).await.map_err(|e| { - StringError::from(format!("Failed to save blocks: {}", e)) - })?; - - total += maybe_tuple.map(|t| t.1).unwrap_or(0); + // unixfsv1.5 metadata seems to be in custom headers for both files and additional + // directories: + // - mtime: timespec + // - mtime-nsecs: timespec + // + // should probably read the metadata here to have it available for both files and + // directories? + // + // FIXME: tomorrow: + // - need to make this a stream + // - need to yield progress reports + // - before yielding file results, we should add it to builder + // - finally at the end we should build the tree + + let content_type = field + .content_type() + .map_err(|e| StringError::from(format!("unparseable content-type: {}", e)))?; + + if content_type == "application/octet-stream" { + // Content-Type: application/octet-stream for files + let mut adder = FileAdder::default(); + let mut total = 0u64; + + loop { + let next = field + .try_next() + .await + .map_err(|e| StringError::from(format!("IO error: {}", e)))?; + + match next { + Some(next) => { + let mut read = 0usize; + while read < next.len() { + let (iter, used) = adder.push(&next.slice(read..)); + read += used; + + let maybe_tuple = import_all(&ipfs, iter).await.map_err(|e| { + StringError::from(format!("Failed to save blocks: {}", e)) + })?; + + total += maybe_tuple.map(|t| t.1).unwrap_or(0); + } } + None => break, } - None => break, } - } - let (root, subtotal) = import_all(&ipfs, adder.finish()) - .await - .map_err(|e| StringError::from(format!("Failed to save blocks: {}", e)))? 
- .expect("I think there should always be something from finish -- except if the link block has just been compressed?"); - - total += subtotal; - - let root = root.to_string(); - - let filename: Cow<'_, str> = if filename.is_empty() { - // cid needs to be repeated if no filename was given - Cow::Borrowed(&root) + let (root, subtotal) = import_all(&ipfs, adder.finish()) + .await + .map_err(|e| StringError::from(format!("Failed to save blocks: {}", e)))? + .expect("I think there should always be something from finish -- except if the link block has just been compressed?"); + + total += subtotal; + + // using the filename as the path since we can tolerate a single empty named file + // however the second one will cause issues + tree.put_file(filename.as_ref().unwrap_or_default(), root, total) + .map_err(|e| { + StringError::from(format!("Failed to record file in the tree: {}", e)) + })?; + + let root = root.to_string(); + + let filename: Cow<'_, str> = if filename.is_empty() { + // cid needs to be repeated if no filename was given + Cow::Borrowed(&root) + } else { + Cow::Owned(filename) + }; + + return Ok(warp::reply::json(&Response::Added { + name: filename, + hash: Cow::Borrowed(&root), + size: Quoted(total), + })); + } else if content_type == "application/x-directory" { + // Content-Type: application/x-directory for additional directories or for setting + // metadata on them + return Err(StringError::from(format!( + "not implemented: {}", + content_type + ))); } else { - Cow::Owned(filename) - }; - - return Ok(warp::reply::json(&Response::Added { - name: filename, - hash: Cow::Borrowed(&root), - size: Quoted(total), - })); + // should be 405? 
+ return Err(StringError::from(format!( + "unsupported content-type: {}", + content_type + ))); + } } Err(StringError::from("not implemented").into()) @@ -194,7 +239,7 @@ mod tests { assert_eq!( body, - r#"{"Hash":"Qma4hjFTnCasJ8PVp3mZbZK5g2vGDT4LByLJ7m8ciyRFZP","Name":"testfile.txt","Size":"20"}"# + "{\"Hash\":\"Qma4hjFTnCasJ8PVp3mZbZK5g2vGDT4LByLJ7m8ciyRFZP\",\"Name\":\"testfile.txt\",\"Size\":\"20\"}\r\n" ); } From ccd6bbe248e9e51318ccd495a463991a84978e0d Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 5 Aug 2020 20:36:56 +0300 Subject: [PATCH 05/57] feat: build directory trees on /add --- Cargo.lock | 1 + http/Cargo.toml | 1 + http/src/v0/root_files/add.rs | 296 +++++++++++++++++++++------------- unixfs/src/dir/builder.rs | 14 +- 4 files changed, 194 insertions(+), 118 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 35a589979..1b4633d50 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1183,6 +1183,7 @@ dependencies = [ name = "ipfs-http" version = "0.1.0" dependencies = [ + "anyhow", "async-stream", "bytes 0.5.6", "cid", diff --git a/http/Cargo.toml b/http/Cargo.toml index d93dde55f..4c5b20477 100644 --- a/http/Cargo.toml +++ b/http/Cargo.toml @@ -10,6 +10,7 @@ prost-build = { default-features = false, version = "0.6" } vergen = { default-features = false, version = "3.1" } [dependencies] +anyhow = "*" async-stream = { default-features = false, version = "0.3" } bytes = { default-features = false, version = "0.5" } cid = { default-features = false, version = "0.5" } diff --git a/http/src/v0/root_files/add.rs b/http/src/v0/root_files/add.rs index b3e96e4fe..e58652b66 100644 --- a/http/src/v0/root_files/add.rs +++ b/http/src/v0/root_files/add.rs @@ -1,11 +1,15 @@ use super::AddArgs; use crate::v0::support::StringError; -use bytes::{Buf, Bytes}; +use bytes::{buf::BufMutExt, Buf, BufMut, Bytes, BytesMut}; use cid::Cid; -use futures::stream::{Stream, TryStreamExt}; -use ipfs::{Ipfs, IpfsTypes}; +use futures::stream::{Stream, StreamExt, TryStreamExt}; 
+use ipfs::unixfs::ll::{ + dir::builder::{BufferingTreeBuilder, TreeBuildingFailed, TreeConstructionFailed}, + file::adder::FileAdder, +}; +use ipfs::{Block, Ipfs, IpfsTypes}; use mime::Mime; -use mpart_async::server::MultipartStream; +use mpart_async::server::{MultipartError, MultipartStream}; use serde::Serialize; use std::borrow::Cow; use std::fmt; @@ -15,142 +19,206 @@ pub(super) async fn add_inner( ipfs: Ipfs, _opts: AddArgs, content_type: Mime, - body: impl Stream> + Unpin, + body: impl Stream> + Send + Unpin + 'static, ) -> Result { - // FIXME: this should be without adder at least - use ipfs::unixfs::ll::{dir::builder::BufferingTreeBuilder, file::adder::FileAdder}; - let boundary = content_type .get_param("boundary") .map(|v| v.to_string()) .ok_or_else(|| StringError::from("missing 'boundary' on content-type"))?; - let mut stream = - MultipartStream::new(Bytes::from(boundary), body.map_ok(|mut buf| buf.to_bytes())); + let stream = MultipartStream::new(Bytes::from(boundary), body.map_ok(|mut buf| buf.to_bytes())); - // TODO: wrap-in-directory option - let mut tree = BufferingTreeBuilder::default(); + // Stream> + // + // refine it to + // + // Stream> + // | | + // | convert rejection and stop the stream? + // | | + // | / + // Stream, impl std::error::Error + Send + Sync + 'static>> - // this should be a while loop but clippy will warn if this is a while loop which will only get - // executed once. - while let Some(mut field) = stream - .try_next() - .await - .map_err(|e| StringError::from(format!("IO error: {}", e)))? - { - let field_name = field - .name() - .map_err(|e| StringError::from(format!("unparseable headers: {}", e)))?; + let st = add_stream(ipfs, stream); - if field_name != "file" { - // this seems constant for files and directories - return Err(StringError::from(format!("unsupported field: {}", field_name)).into()); - } + // TODO: we could map the errors into json objects at least? 
(as we cannot return them as + // trailers) + + let body = crate::v0::support::StreamResponse(st); + + Ok(body) +} + +#[derive(Debug)] +enum AddError { + Parsing(MultipartError), + Header(MultipartError), + InvalidFilename(std::str::Utf8Error), + UnsupportedField(String), + UnsupportedContentType(String), + ResponseSerialization(serde_json::Error), + Persisting(ipfs::Error), + TreeGathering(TreeBuildingFailed), + TreeBuilding(TreeConstructionFailed), +} + +impl From for AddError { + fn from(e: MultipartError) -> AddError { + AddError::Parsing(e) + } +} + +impl fmt::Display for AddError { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + // TODO + write!(fmt, "{:?}", self) + } +} + +impl std::error::Error for AddError {} + +fn add_stream( + ipfs: Ipfs, + mut fields: MultipartStream, +) -> impl Stream> + Send + 'static +where + St: Stream> + Send + Unpin + 'static, + E: Into + Send + 'static, +{ + async_stream::try_stream! { + // TODO: wrap-in-directory option + let mut tree = BufferingTreeBuilder::default(); + + let mut buffer = BytesMut::new(); - let filename = field - .filename() - .map_err(|e| StringError::from(format!("unparseable filename: {}", e)))? - .to_string(); - - // unixfsv1.5 metadata seems to be in custom headers for both files and additional - // directories: - // - mtime: timespec - // - mtime-nsecs: timespec - // - // should probably read the metadata here to have it available for both files and - // directories? 
- // - // FIXME: tomorrow: - // - need to make this a stream - // - need to yield progress reports - // - before yielding file results, we should add it to builder - // - finally at the end we should build the tree - - let content_type = field - .content_type() - .map_err(|e| StringError::from(format!("unparseable content-type: {}", e)))?; - - if content_type == "application/octet-stream" { - // Content-Type: application/octet-stream for files - let mut adder = FileAdder::default(); - let mut total = 0u64; - - loop { - let next = field - .try_next() - .await - .map_err(|e| StringError::from(format!("IO error: {}", e)))?; - - match next { - Some(next) => { - let mut read = 0usize; - while read < next.len() { - let (iter, used) = adder.push(&next.slice(read..)); - read += used; - - let maybe_tuple = import_all(&ipfs, iter).await.map_err(|e| { - StringError::from(format!("Failed to save blocks: {}", e)) - })?; - - total += maybe_tuple.map(|t| t.1).unwrap_or(0); + tracing::trace!("stream started"); + + while let Some(mut field) = fields + .try_next() + .await? + { + + let field_name = field.name().map_err(AddError::Header)?; + + // files are file{,-1,-2,-3,..} + // directories are dir{,-1,-2,-3,..} + + let _ = if !field_name.starts_with("file") { + // this seems constant for files and directories + Err(AddError::UnsupportedField(field_name.to_string())) + } else { + // this is a bit ackward with the ? operator but it should save us the yield + // Err(..) followed by return; this is only available in the `stream!` variant, + // which continues after errors by default.. 
+ Ok(()) + }?; + + let filename = field.filename().map_err(AddError::Header)?; + let filename = percent_encoding::percent_decode_str(filename) + .decode_utf8() + .map(|cow| cow.into_owned()) + .map_err(AddError::InvalidFilename)?; + + let content_type = field.content_type().map_err(AddError::Header)?; + + let next = match content_type { + "application/octet-stream" => { + tracing::trace!("processing file {:?}", filename); + let mut adder = FileAdder::default(); + let mut total = 0u64; + + loop { + let next = field + .try_next() + .await + .map_err(AddError::Parsing)?; + + match next { + Some(next) => { + let mut read = 0usize; + while read < next.len() { + let (iter, used) = adder.push(&next.slice(read..)); + read += used; + + let maybe_tuple = import_all(&ipfs, iter).await.map_err(AddError::Persisting)?; + + total += maybe_tuple.map(|t| t.1).unwrap_or(0); + } + + tracing::trace!("read {} bytes", read); + } + None => break, } } - None => break, + + let (root, subtotal) = import_all(&ipfs, adder.finish()) + .await + .map_err(AddError::Persisting)? 
+ .expect("I think there should always be something from finish -- except if the link block has just been compressed?"); + + total += subtotal; + + tracing::trace!("completed processing file of {} bytes: {:?}", total, filename); + + // using the filename as the path since we can tolerate a single empty named file + // however the second one will cause issues + tree.put_file(&filename, root.clone(), total) + .map_err(AddError::TreeGathering)?; + + let filename: Cow<'_, str> = if filename.is_empty() { + // cid needs to be repeated if no filename was given + Cow::Owned(root.to_string()) + } else { + Cow::Owned(filename) + }; + + serde_json::to_writer((&mut buffer).writer(), &Response::Added { + name: filename, + hash: Quoted(&root), + size: Quoted(total), + }).map_err(AddError::ResponseSerialization)?; + + buffer.put(&b"\r\n"[..]); + + Ok(buffer.split().freeze()) + }, + /*"application/x-directory" + |*/ unsupported => { + Err(AddError::UnsupportedContentType(unsupported.to_string())) } - } + }?; - let (root, subtotal) = import_all(&ipfs, adder.finish()) - .await - .map_err(|e| StringError::from(format!("Failed to save blocks: {}", e)))? 
- .expect("I think there should always be something from finish -- except if the link block has just been compressed?"); + yield next; + } - total += subtotal; + let mut full_path = String::new(); + let mut block_buffer = Vec::new(); - // using the filename as the path since we can tolerate a single empty named file - // however the second one will cause issues - tree.put_file(filename.as_ref().unwrap_or_default(), root, total) - .map_err(|e| { - StringError::from(format!("Failed to record file in the tree: {}", e)) - })?; + let mut iter = tree.build(&mut full_path, &mut block_buffer); - let root = root.to_string(); + while let Some(res) = iter.next_borrowed() { + let (path, cid, total, block) = res.map_err(AddError::TreeBuilding)?; - let filename: Cow<'_, str> = if filename.is_empty() { - // cid needs to be repeated if no filename was given - Cow::Borrowed(&root) - } else { - Cow::Owned(filename) - }; + // shame we need to allocate once again here.. + ipfs.put_block(Block { cid: cid.to_owned(), data: block.into() }).await.map_err(AddError::Persisting)?; - return Ok(warp::reply::json(&Response::Added { - name: filename, - hash: Cow::Borrowed(&root), + serde_json::to_writer((&mut buffer).writer(), &Response::Added { + name: Cow::Borrowed(path), + hash: Quoted(cid), size: Quoted(total), - })); - } else if content_type == "application/x-directory" { - // Content-Type: application/x-directory for additional directories or for setting - // metadata on them - return Err(StringError::from(format!( - "not implemented: {}", - content_type - ))); - } else { - // should be 405? 
- return Err(StringError::from(format!( - "unsupported content-type: {}", - content_type - ))); + }).map_err(AddError::ResponseSerialization)?; + + buffer.put(&b"\r\n"[..]); + + yield buffer.split().freeze(); } } - - Err(StringError::from("not implemented").into()) } async fn import_all( ipfs: &Ipfs, iter: impl Iterator)>, ) -> Result, ipfs::Error> { - use ipfs::Block; // TODO: use FuturesUnordered let mut last: Option = None; let mut total = 0u64; @@ -188,10 +256,10 @@ enum Response<'a> { #[serde(rename_all = "PascalCase")] Added { /// The resulting Cid as a string. - hash: Cow<'a, str>, + hash: Quoted<&'a Cid>, /// Name of the file added from filename or the resulting Cid. name: Cow<'a, str>, - /// Stringified version of the total size in bytes. + /// Stringified version of the total cumulative size in bytes. size: Quoted, }, } diff --git a/unixfs/src/dir/builder.rs b/unixfs/src/dir/builder.rs index 243dad2b2..3ec37bcb9 100644 --- a/unixfs/src/dir/builder.rs +++ b/unixfs/src/dir/builder.rs @@ -243,7 +243,7 @@ impl BufferingTreeBuilder { /// Returned `PostOrderIterator` will use the given `full_path` and `block_buffer` to store /// it's data during the walk. `PostOrderIterator` implements `Iterator` while also allowing /// borrowed access via `next_borrowed`. 
- fn build<'a>( + pub fn build<'a>( self, full_path: &'a mut String, block_buffer: &'a mut Vec, @@ -263,6 +263,7 @@ impl BufferingTreeBuilder { persisted_cids: Default::default(), reused_children: Vec::new(), cid: None, + total_size: 0, wrap_in_directory: self.opts.wrap_in_directory, } } @@ -336,6 +337,7 @@ pub struct PostOrderIterator<'a> { persisted_cids: HashMap, BTreeMap>, reused_children: Vec, cid: Option, + total_size: u64, // from TreeOptions wrap_in_directory: bool, } @@ -411,9 +413,9 @@ impl<'a> PostOrderIterator<'a> { }) } - fn next_borrowed<'b>( + pub fn next_borrowed<'b>( &'b mut self, - ) -> Option> { + ) -> Option> { while let Some(visited) = self.pending.pop() { let (name, depth) = match &visited { Visited::Descent { name, depth, .. } => (name.as_deref(), *depth), @@ -494,6 +496,7 @@ impl<'a> PostOrderIterator<'a> { }; self.cid = Some(leaf.link.clone()); + self.total_size = leaf.total_size; // this reuse strategy is probably good enough collected.clear(); @@ -525,6 +528,7 @@ impl<'a> PostOrderIterator<'a> { return Some(Ok(( self.full_path.as_str(), self.cid.as_ref().unwrap(), + self.total_size, &self.block_buffer, ))); } @@ -539,7 +543,9 @@ impl<'a> Iterator for PostOrderIterator<'a> { fn next(&mut self) -> Option { self.next_borrowed().map(|res| { - res.map(|(full_path, cid, block)| (full_path.to_string(), cid.to_owned(), block.into())) + res.map(|(full_path, cid, _, block)| { + (full_path.to_string(), cid.to_owned(), block.into()) + }) }) } } From f0bb5b41fdeee9efe41083c7b4e7f003d0d1a871 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 5 Aug 2020 21:10:41 +0300 Subject: [PATCH 06/57] feat: create empty directories without metadata this is the go-ipfs 0.5 level support for add and directories. next up will be parsing the headers as unixfsv1.5 metadata and using those with the directories *and* files. 
--- http/src/v0/root_files/add.rs | 53 ++++++++++++++++++++--------------- unixfs/src/dir/builder.rs | 7 +---- 2 files changed, 31 insertions(+), 29 deletions(-) diff --git a/http/src/v0/root_files/add.rs b/http/src/v0/root_files/add.rs index e58652b66..a26f658cb 100644 --- a/http/src/v0/root_files/add.rs +++ b/http/src/v0/root_files/add.rs @@ -88,31 +88,14 @@ where async_stream::try_stream! { // TODO: wrap-in-directory option let mut tree = BufferingTreeBuilder::default(); - let mut buffer = BytesMut::new(); - tracing::trace!("stream started"); - while let Some(mut field) = fields .try_next() .await? { let field_name = field.name().map_err(AddError::Header)?; - - // files are file{,-1,-2,-3,..} - // directories are dir{,-1,-2,-3,..} - - let _ = if !field_name.starts_with("file") { - // this seems constant for files and directories - Err(AddError::UnsupportedField(field_name.to_string())) - } else { - // this is a bit ackward with the ? operator but it should save us the yield - // Err(..) followed by return; this is only available in the `stream!` variant, - // which continues after errors by default.. 
- Ok(()) - }?; - let filename = field.filename().map_err(AddError::Header)?; let filename = percent_encoding::percent_decode_str(filename) .decode_utf8() @@ -123,7 +106,14 @@ where let next = match content_type { "application/octet-stream" => { - tracing::trace!("processing file {:?}", filename); + + // files are file{,-1,-2,-3,..} + let _ = if field_name != "file" && !field_name.starts_with("file-") { + Err(AddError::UnsupportedField(field_name.to_string())) + } else { + Ok(()) + }?; + let mut adder = FileAdder::default(); let mut total = 0u64; @@ -144,8 +134,6 @@ where total += maybe_tuple.map(|t| t.1).unwrap_or(0); } - - tracing::trace!("read {} bytes", read); } None => break, } @@ -166,7 +154,8 @@ where .map_err(AddError::TreeGathering)?; let filename: Cow<'_, str> = if filename.is_empty() { - // cid needs to be repeated if no filename was given + // cid needs to be repeated if no filename was given; in which case there + // should not be anything to build as tree either. Cow::Owned(root.to_string()) } else { Cow::Owned(filename) @@ -182,8 +171,26 @@ where Ok(buffer.split().freeze()) }, - /*"application/x-directory" - |*/ unsupported => { + "application/x-directory" => { + // dirs are dir{,-1,-2,-3,..} + let _ = if field_name != "dir" && !field_name.starts_with("dir-") { + Err(AddError::UnsupportedField(field_name.to_string())) + } else { + Ok(()) + }?; + + // we need to fully consume this part, even though there shouldn't be anything + // except for the already parsed *but* ignored headers + while let Some(_) = field.try_next().await.map_err(AddError::Parsing)? {} + + // while we don't at the moment parse the mtime, mtime-nsec headers and mode + // those should be reflected in the metadata. this will still add an empty + // directory which is good thing. 
+ tree.set_metadata(&filename, ipfs::unixfs::ll::Metadata::default()) + .map_err(AddError::TreeGathering)?; + continue; + } + unsupported => { Err(AddError::UnsupportedContentType(unsupported.to_string())) } }?; diff --git a/unixfs/src/dir/builder.rs b/unixfs/src/dir/builder.rs index 3ec37bcb9..f21b46531 100644 --- a/unixfs/src/dir/builder.rs +++ b/unixfs/src/dir/builder.rs @@ -1,3 +1,4 @@ +use crate::Metadata; use cid::Cid; use std::collections::hash_map::Entry::*; use std::collections::{BTreeMap, HashMap}; @@ -624,12 +625,6 @@ impl DirBuilder { } } -#[derive(Default, Debug)] -pub struct Metadata { - mtime: Option<(i64, u32)>, - mode: Option, -} - #[cfg(test)] mod tests { use super::{BufferingTreeBuilder, Metadata, TreeBuildingFailed, TreeOptions}; From bb9555b14eb201f9898b2a4b57d647e51ed41b87 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 5 Aug 2020 21:11:08 +0300 Subject: [PATCH 07/57] doc: explain anyhow dependency --- http/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/http/Cargo.toml b/http/Cargo.toml index 4c5b20477..bccbcc8b5 100644 --- a/http/Cargo.toml +++ b/http/Cargo.toml @@ -10,7 +10,7 @@ prost-build = { default-features = false, version = "0.6" } vergen = { default-features = false, version = "3.1" } [dependencies] -anyhow = "*" +anyhow = "*" # temporarily needed until the next release of mpart-async async-stream = { default-features = false, version = "0.3" } bytes = { default-features = false, version = "0.5" } cid = { default-features = false, version = "0.5" } From 548dbe2873c983ac9edeb8188b2c6ffb7cbb1599 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 5 Aug 2020 21:49:23 +0300 Subject: [PATCH 08/57] chore: appease clippy single while let Some(_) = ... changed to is_some(). 
--- http/src/v0/root_files/add.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/http/src/v0/root_files/add.rs b/http/src/v0/root_files/add.rs index a26f658cb..20ad6730f 100644 --- a/http/src/v0/root_files/add.rs +++ b/http/src/v0/root_files/add.rs @@ -2,7 +2,7 @@ use super::AddArgs; use crate::v0::support::StringError; use bytes::{buf::BufMutExt, Buf, BufMut, Bytes, BytesMut}; use cid::Cid; -use futures::stream::{Stream, StreamExt, TryStreamExt}; +use futures::stream::{Stream, TryStreamExt}; use ipfs::unixfs::ll::{ dir::builder::{BufferingTreeBuilder, TreeBuildingFailed, TreeConstructionFailed}, file::adder::FileAdder, @@ -181,7 +181,7 @@ where // we need to fully consume this part, even though there shouldn't be anything // except for the already parsed *but* ignored headers - while let Some(_) = field.try_next().await.map_err(AddError::Parsing)? {} + while field.try_next().await.map_err(AddError::Parsing)?.is_some() {} // while we don't at the moment parse the mtime, mtime-nsec headers and mode // those should be reflected in the metadata. 
this will still add an empty From 6ba41955003358a7f30b068cc65eed5451f52a94 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 5 Aug 2020 21:58:53 +0300 Subject: [PATCH 09/57] refactor: split dir/builder.rs up * split off dir_builder.rs * split off PostOrderIterator * split bufferedtreebuilder off --- unixfs/src/dir/builder.rs | 728 +------------------------- unixfs/src/dir/builder/buffered.rs | 388 ++++++++++++++ unixfs/src/dir/builder/dir_builder.rs | 78 +++ unixfs/src/dir/builder/iter.rs | 285 ++++++++++ 4 files changed, 763 insertions(+), 716 deletions(-) create mode 100644 unixfs/src/dir/builder/buffered.rs create mode 100644 unixfs/src/dir/builder/dir_builder.rs create mode 100644 unixfs/src/dir/builder/iter.rs diff --git a/unixfs/src/dir/builder.rs b/unixfs/src/dir/builder.rs index f21b46531..662a07adb 100644 --- a/unixfs/src/dir/builder.rs +++ b/unixfs/src/dir/builder.rs @@ -1,8 +1,16 @@ use crate::Metadata; use cid::Cid; -use std::collections::hash_map::Entry::*; use std::collections::{BTreeMap, HashMap}; -use std::fmt::{self, Write}; +use std::fmt; + +mod dir_builder; +use dir_builder::{DirBuilder, DuplicateName, FoundLeaf}; + +mod iter; +pub use iter::PostOrderIterator; + +mod buffered; +pub use buffered::BufferingTreeBuilder; enum Entry { Leaf(Leaf), @@ -41,6 +49,7 @@ impl fmt::Debug for Leaf { } } +/// Configuration for customizing how the tree is built. #[derive(Default, Debug)] pub struct TreeOptions { wrap_in_directory: bool, @@ -87,189 +96,6 @@ impl fmt::Display for TreeBuildingFailed { impl std::error::Error for TreeBuildingFailed {} -#[derive(Debug)] -pub struct BufferingTreeBuilder { - /// At the root there can be only one element, unless an option was given to create a new - /// directory surrounding the root elements. 
- root_builder: DirBuilder, - longest_path: usize, - // used to generate each node an unique id which is used when doing the post order traversal to - // recover all childrens rendered Cids - counter: u64, - opts: TreeOptions, -} - -impl Default for BufferingTreeBuilder { - fn default() -> Self { - Self::new(TreeOptions::default()) - } -} - -impl BufferingTreeBuilder { - pub fn new(opts: TreeOptions) -> Self { - BufferingTreeBuilder { - root_builder: DirBuilder::root(0), - longest_path: 0, - counter: 1, - opts, - } - } - - // metadata has no bearing here - pub fn put_file( - &mut self, - full_path: &str, - target: Cid, - total_size: u64, - ) -> Result<(), TreeBuildingFailed> { - // inserted at the depth - let leaf = Leaf { - link: target, - total_size, - }; - - self.modify_with(full_path, |parent, basename, _| { - parent - .put_leaf(basename, leaf) - .map_err(|_| TreeBuildingFailed::DuplicatePath(full_path.to_string())) - }) - } - - /// Directories get "put" implicitly through the put files, and directories need to be adjusted - /// only when wanting them to have metadata. - pub fn set_metadata( - &mut self, - full_path: &str, - metadata: Metadata, - ) -> Result<(), TreeBuildingFailed> { - // create all paths along the way - // - // set if not set, error otherwise? FIXME: doesn't error atm - self.modify_with(full_path, |parent, basename, id| { - parent - .add_or_get_node(basename, id) - .map_err(|_| TreeBuildingFailed::LeafAsDirectory(full_path.to_string()))? 
- .set_metadata(metadata); - Ok(()) - }) - } - - fn modify_with(&mut self, full_path: &str, f: F) -> Result<(), TreeBuildingFailed> - where - F: FnOnce(&mut DirBuilder, String, &mut Option) -> Result<(), TreeBuildingFailed>, - { - // create all paths along the way - // - // assuming it's ok to split '/' since that cannot be escaped in linux at least - - self.longest_path = full_path.len().max(self.longest_path); - let mut remaining = full_path.split('/').enumerate().peekable(); - let mut dir_builder = &mut self.root_builder; - - // needed to avoid borrowing into the DirBuilder::new calling closure - let counter = &mut self.counter; - - while let Some((depth, next)) = remaining.next() { - let last = remaining.peek().is_none(); - - match (depth, next, last) { - // this might need to be accepted in case there is just a single file - (0, "", true) => { /* accepted */ } - (0, "", false) => { - return Err(TreeBuildingFailed::RootedPath(full_path.to_string())) - } - (_, "", false) => { - return Err(TreeBuildingFailed::RepeatSlashesInPath( - full_path.to_string(), - )) - } - (_, "", true) => todo!("path ends in slash"), - _ => {} - } - - // our first level can be full given the options - let full = depth == 0 && !self.opts.wrap_in_directory && dir_builder.is_empty(); - - if last { - let mut next_id = Some(*counter); - - let ret = if full { - Err(TreeBuildingFailed::TooManyRootLevelEntries) - } else { - f(dir_builder, next.to_string(), &mut next_id) - }; - - if next_id.is_none() { - *counter += 1; - } - - if ret.is_err() { - // FIXME: there might be a case where we have now stale nodes in our tree but - // cannot figure out an example for that. 
- } - - return ret; - } - - let parent_id = dir_builder.id; - - dir_builder = match (full, dir_builder.nodes.entry(next.to_string())) { - (_, Occupied(oe)) => oe - .into_mut() - .as_dir_builder() - .map_err(|_| TreeBuildingFailed::LeafAsDirectory(full_path.to_string()))?, - (false, Vacant(ve)) => { - let next_id = *counter; - *counter += 1; - ve.insert(Entry::Directory(DirBuilder::new(parent_id, next_id))) - .as_dir_builder() - .expect("safe: we just inserted a DirBuilder") - } - (true, Vacant(_)) => return Err(TreeBuildingFailed::TooManyRootLevelEntries), - }; - } - - // as the str::split will always return a single element this should not ever be hit - unreachable!( - "walked the full_path but failed to add anything: {:?}", - full_path - ); - } - - /// Called to build the tree. The built tree will have the added files and their implied - /// directory structure, along with the any directory entries which were created using - /// `set_metadata`. To build the whole hierarchy, one must iterate the returned iterator to - /// completion while storing the created blocks. - /// - /// Returned `PostOrderIterator` will use the given `full_path` and `block_buffer` to store - /// it's data during the walk. `PostOrderIterator` implements `Iterator` while also allowing - /// borrowed access via `next_borrowed`. 
- pub fn build<'a>( - self, - full_path: &'a mut String, - block_buffer: &'a mut Vec, - ) -> PostOrderIterator<'a> { - full_path.clear(); - block_buffer.clear(); - - PostOrderIterator { - full_path, - old_depth: 0, - block_buffer, - pending: vec![Visited::Descent { - node: self.root_builder, - name: None, - depth: 0, - }], - persisted_cids: Default::default(), - reused_children: Vec::new(), - cid: None, - total_size: 0, - wrap_in_directory: self.opts.wrap_in_directory, - } - } -} - #[derive(Debug)] enum Visited { Descent { @@ -286,63 +112,7 @@ enum Visited { }, } -fn update_full_path( - (full_path, old_depth): (&mut String, &mut usize), - name: Option<&str>, - depth: usize, -) { - if depth < 2 { - // initially thought it might be good idea to add slash to all components; removing it made - // it impossible to get back down to empty string, so fixing this for depths 0 and 1. - full_path.clear(); - *old_depth = 0; - } else { - while *old_depth >= depth && *old_depth > 0 { - // we now want to pop the last segment - // this would be easier with pathbuf - let slash_at = full_path.bytes().rposition(|ch| ch == b'/'); - if let Some(slash_at) = slash_at { - full_path.truncate(slash_at); - *old_depth -= 1; - } else { - todo!( - "no last slash_at in {:?} yet {} >= {}", - full_path, - old_depth, - depth - ); - } - } - } - - debug_assert!(*old_depth <= depth); - - if let Some(name) = name { - if !full_path.is_empty() { - full_path.push_str("/"); - } - full_path.push_str(name); - *old_depth += 1; - } - - assert_eq!(*old_depth, depth); -} - -pub struct PostOrderIterator<'a> { - full_path: &'a mut String, - old_depth: usize, - block_buffer: &'a mut Vec, - // our stack of pending work - pending: Vec, - // "communication channel" from nested entries back to their parents - persisted_cids: HashMap, BTreeMap>, - reused_children: Vec, - cid: Option, - total_size: u64, - // from TreeOptions - wrap_in_directory: bool, -} - +/// Failure cases for `PostOrderIterator` creating the tree 
dag-pb nodes. #[derive(Debug)] pub enum TreeConstructionFailed { // TODO: at least any quick_protobuf errors here? @@ -355,477 +125,3 @@ impl fmt::Display for TreeConstructionFailed { } impl std::error::Error for TreeConstructionFailed {} - -impl<'a> PostOrderIterator<'a> { - fn render_directory( - links: &BTreeMap, - buffer: &mut Vec, - ) -> Result { - use crate::pb::{FlatUnixFs, PBLink, UnixFs, UnixFsType}; - use quick_protobuf::{BytesWriter, MessageWrite, Writer}; - use sha2::{Digest, Sha256}; - use std::borrow::Cow; - - // TODO: this could quite easily be made so that the links are read from the btreemap for - // calculating the size and rendering - let mut combined_from_links = 0; - - let flat = FlatUnixFs { - links: links - .iter() // .drain() would be the most reasonable - .inspect(|(_, Leaf { total_size, .. })| combined_from_links += total_size) - .map(|(name, Leaf { link, total_size })| PBLink { - Hash: Some(link.to_bytes().into()), - Name: Some(Cow::Borrowed(name.as_str())), - Tsize: Some(*total_size), - }) - .collect::>(), - data: UnixFs { - Type: UnixFsType::Directory, - Data: None, - ..Default::default() - }, - }; - - let size = flat.get_size(); - - // FIXME: we shouldn't be creating too large structures (bitswap block size limit!) 
- // FIXME: changing this to autosharding is going to take some thinking - - buffer.clear(); - let cap = buffer.capacity(); - - if let Some(additional) = size.checked_sub(cap) { - buffer.reserve(additional); - } - - // argh - buffer.extend(std::iter::repeat(0).take(size)); - - let mut writer = Writer::new(BytesWriter::new(&mut buffer[..])); - flat.write_message(&mut writer) - .expect("unsure how this could fail"); - let mh = multihash::wrap(multihash::Code::Sha2_256, &Sha256::digest(&buffer)); - let cid = Cid::new_v0(mh).expect("sha2_256 is the correct multihash for cidv0"); - - Ok(Leaf { - link: cid, - total_size: buffer.len() as u64 + combined_from_links, - }) - } - - pub fn next_borrowed<'b>( - &'b mut self, - ) -> Option> { - while let Some(visited) = self.pending.pop() { - let (name, depth) = match &visited { - Visited::Descent { name, depth, .. } => (name.as_deref(), *depth), - Visited::Post { name, depth, .. } => (name.as_deref(), *depth), - }; - - update_full_path((self.full_path, &mut self.old_depth), name, depth); - - match visited { - Visited::Descent { node, name, depth } => { - let mut leaves = Vec::new(); - - let children = &mut self.reused_children; - - for (k, v) in node.nodes { - match v { - Entry::Directory(node) => children.push(Visited::Descent { - node, - name: Some(k), - depth: depth + 1, - }), - Entry::Leaf(leaf) => leaves.push((k, leaf)), - } - } - - self.pending.push(Visited::Post { - parent_id: node.parent_id, - id: node.id, - name, - depth, - leaves, - }); - - let any_children = !children.is_empty(); - - self.pending.extend(children.drain(..)); - - if any_children { - // we could strive to do everything right now but pushing and popping might - // turn out easier code wise, or in other words, when there are no child_nodes - // we wouldn't need to go through Visited::Post. - } - } - Visited::Post { - parent_id, - id, - name, - leaves, - .. 
- } => { - // all of our children have now been visited; we should be able to find their - // Cids in the btreemap - let mut collected = self.persisted_cids.remove(&Some(id)).unwrap_or_default(); - - // FIXME: leaves could be drained and reused - collected.extend(leaves); - - if !self.wrap_in_directory && parent_id.is_none() { - // we aren't supposed to wrap_in_directory, and we are now looking at the - // possibly to be generated root directory. - - assert_eq!( - collected.len(), - 1, - "should not have gone this far with multiple added roots" - ); - - return None; - } - - // render unixfs, maybe return it? - let buffer = &mut self.block_buffer; - buffer.clear(); - - let leaf = match Self::render_directory(&collected, buffer) { - Ok(leaf) => leaf, - Err(e) => return Some(Err(e)), - }; - - self.cid = Some(leaf.link.clone()); - self.total_size = leaf.total_size; - - // this reuse strategy is probably good enough - collected.clear(); - - if let Some(name) = name { - // name is none only for the wrap_in_directory, which cannot really be - // propagated up but still the parent_id is allowed to be None - let previous = self - .persisted_cids - .entry(parent_id) - .or_insert(collected) - .insert(name, leaf); - - assert!(previous.is_none()); - } - - if parent_id.is_none() { - // rewrite the full_path for the wrap_in_directory - assert!( - self.full_path.is_empty(), - "full_path should had been empty but it was not: {:?}", - self.full_path - ); - // at the wrap_in_directory level the name should be the root level Cid - write!(self.full_path, "{}", self.cid.as_ref().unwrap()).unwrap(); - self.old_depth += 1; - } - - return Some(Ok(( - self.full_path.as_str(), - self.cid.as_ref().unwrap(), - self.total_size, - &self.block_buffer, - ))); - } - } - } - None - } -} - -impl<'a> Iterator for PostOrderIterator<'a> { - type Item = Result<(String, Cid, Box<[u8]>), TreeConstructionFailed>; - - fn next(&mut self) -> Option { - self.next_borrowed().map(|res| { - res.map(|(full_path, 
cid, _, block)| { - (full_path.to_string(), cid.to_owned(), block.into()) - }) - }) - } -} - -struct DuplicateName; -struct FoundLeaf; - -/// Node in a directory tree. -#[derive(Debug)] -struct DirBuilder { - /// Immediate files, symlinks or directories in this directory - nodes: HashMap, - /// Metadata for this directory - metadata: Metadata, - /// Id of the parent; None for the root node - parent_id: Option, - /// Internal id, used for propagating Cids back from children during post order visit. - id: u64, -} - -impl DirBuilder { - fn new(parent_id: u64, id: u64) -> Self { - assert_ne!(parent_id, id); - DirBuilder { - nodes: HashMap::new(), - metadata: Default::default(), - parent_id: Some(parent_id), - id, - } - } - - fn root(id: u64) -> Self { - DirBuilder { - nodes: HashMap::new(), - metadata: Default::default(), - parent_id: None, - id, - } - } - - fn put_leaf(&mut self, key: String, leaf: Leaf) -> Result<(), DuplicateName> { - match self.nodes.entry(key) { - Occupied(_) => Err(DuplicateName), - Vacant(ve) => { - ve.insert(Entry::Leaf(leaf)); - Ok(()) - } - } - } - - fn add_or_get_node( - &mut self, - key: String, - id: &mut Option, - ) -> Result<&mut DirBuilder, FoundLeaf> { - match self.nodes.entry(key) { - Occupied(oe) => oe.into_mut().as_dir_builder().map_err(|_| FoundLeaf), - Vacant(ve) => { - let id = id.take().unwrap(); - let entry = ve.insert(Entry::Directory(Self::new(self.id, id))); - Ok(entry.as_dir_builder().expect("just inserted")) - } - } - } - - fn len(&self) -> usize { - self.nodes.len() - } - - fn is_empty(&self) -> bool { - self.len() != 0 - } - - fn set_metadata(&mut self, metadata: Metadata) { - self.metadata = metadata; - } -} - -#[cfg(test)] -mod tests { - use super::{BufferingTreeBuilder, Metadata, TreeBuildingFailed, TreeOptions}; - use cid::Cid; - use std::convert::TryFrom; - - #[test] - fn some_directories() { - let mut builder = BufferingTreeBuilder::default(); - - // foobar\n - let five_block_foobar = - 
Cid::try_from("QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6").unwrap(); - - builder - .put_file("a/b/c/d/e/f/g.txt", five_block_foobar.clone(), 221) - .unwrap(); - builder - .put_file("a/b/c/d/e/h.txt", five_block_foobar.clone(), 221) - .unwrap(); - builder - .put_file("a/b/c/d/e/i.txt", five_block_foobar.clone(), 221) - .unwrap(); - - let mut full_path = String::new(); - let mut buffer = Vec::new(); - - let iter = builder.build(&mut full_path, &mut buffer); - let mut actual = iter - .map(|res| res.map(|(p, cid, buf)| (p, cid, buf))) - .collect::, _>>() - .unwrap(); - - let mut expected = vec![ - ( - "a/b/c/d/e/f", - "Qmbgf44ztW9wLcGNRNYGinGQB6SQDQtbHVbkM5MrWms698", - ), - ( - "a/b/c/d/e", - "Qma1hCr3CuPRAq2Gw4DCNMqsi42Bjs4Bt1MGSS57kNh144", - ), - ("a/b/c/d", "QmUqaYatcJqiSFdykHXGh4Nog1eMSfDJBeYzcG67KV5Ri4"), - ("a/b/c", "QmYwaNBaGpDCNN9XpHmjxVPHmEXZMw9KDY3uikE2UU5fVB"), - ("a/b", "QmeAzCPig4o4gBLh2LvP96Sr8MUBrsu2Scw9MTq1EvTDhY"), - ("a", "QmSTUFaPwJW8xD4KNRLLQRqVTYtYC29xuhYTJoYPWdzvKp"), - ]; - - // hopefully this way the errors will be easier to hunt down - - actual.reverse(); - expected.reverse(); - - while let Some(actual) = actual.pop() { - let expected = expected.pop().expect("size mismatch"); - assert_eq!(actual.0, expected.0); - assert_eq!( - actual.1.to_string(), - expected.1, - "{:?}: {:?}", - actual.0, - Hex(&actual.2) - ); - } - } - - struct Hex<'a>(&'a [u8]); - use std::fmt; - - impl<'a> fmt::Debug for Hex<'a> { - fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { - for b in self.0 { - write!(fmt, "{:02x}", b)?; - } - Ok(()) - } - } - - #[test] - fn empty_path() { - let mut builder = BufferingTreeBuilder::default(); - builder.put_file("", some_cid(0), 1).unwrap(); - - let mut full_path = String::new(); - let mut buffer = Vec::new(); - - let iter = builder.build(&mut full_path, &mut buffer); - let actual = iter - .map(|res| res.map(|(p, _, _)| p)) - .collect::, _>>() - .unwrap(); - - assert!( - actual.is_empty(), - "wrapping in directory was 
not asked, single element" - ); - } - - #[test] - #[should_panic] - fn rooted_path() { - let mut builder = BufferingTreeBuilder::default(); - builder.put_file("/a", some_cid(0), 1).unwrap(); - } - - #[test] - #[should_panic] - fn successive_slashes() { - let mut builder = BufferingTreeBuilder::default(); - builder.put_file("a//b", some_cid(0), 1).unwrap(); - } - - #[test] - fn multiple_roots() { - // foobar\n - let five_block_foobar = - Cid::try_from("QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6").unwrap(); - - let opts = TreeOptions::default().with_wrap_in_directory(); - let mut builder = BufferingTreeBuilder::new(opts); - builder - .put_file("a", five_block_foobar.clone(), 221) - .unwrap(); - builder.put_file("b", five_block_foobar, 221).unwrap(); - - let mut full_path = String::new(); - let mut buffer = Vec::new(); - - let iter = builder.build(&mut full_path, &mut buffer); - let actual = iter - .map(|res| res.map(|(p, _, _)| p)) - .collect::, _>>() - .unwrap(); - - assert_eq!(actual, &["QmdbWuhpVCX9weVMMqvVTMeGwKMqCNJDbx7ZK1zG36sea7"]); - } - - #[test] - #[should_panic] - fn denied_multiple_root_dirs() { - let mut builder = BufferingTreeBuilder::default(); - builder.put_file("a/c.txt", some_cid(0), 1).unwrap(); - builder.put_file("b/d.txt", some_cid(1), 1).unwrap(); - } - - #[test] - #[should_panic] - fn denied_multiple_root_files() { - let mut builder = BufferingTreeBuilder::default(); - builder.put_file("a.txt", some_cid(0), 1).unwrap(); - builder.put_file("b.txt", some_cid(1), 1).unwrap(); - } - - #[test] - #[should_panic] - fn using_leaf_as_node() { - let mut builder = BufferingTreeBuilder::default(); - builder.put_file("a.txt", some_cid(0), 1).unwrap(); - builder.put_file("a.txt/b.txt", some_cid(1), 1).unwrap(); - } - - #[test] - fn set_metadata_before_files() { - let mut builder = BufferingTreeBuilder::default(); - builder - .set_metadata("a/b/c/d", Metadata::default()) - .unwrap(); - builder.put_file("a/b/c/d/e.txt", some_cid(1), 1).unwrap(); - 
builder.put_file("a/b/c/d/f.txt", some_cid(2), 1).unwrap(); - - let mut full_path = String::new(); - let mut buffer = Vec::new(); - - let iter = builder.build(&mut full_path, &mut buffer); - let actual = iter - .map(|res| res.map(|(p, _, _)| p)) - .collect::, _>>() - .unwrap(); - - assert_eq!(actual, &["a/b/c/d", "a/b/c", "a/b", "a",]) - } - - #[test] - fn set_metadata_on_file() { - let mut builder = BufferingTreeBuilder::default(); - builder.put_file("a/a.txt", some_cid(0), 1).unwrap(); - let err = builder - .set_metadata("a/a.txt", Metadata::default()) - .unwrap_err(); - - assert!( - matches!(err, TreeBuildingFailed::LeafAsDirectory(_)), - "{:?}", - err - ); - } - - /// Returns a quick and dirty sha2-256 of the given number as a Cidv0 - fn some_cid(number: usize) -> Cid { - use multihash::Sha2_256; - let mh = Sha2_256::digest(&number.to_le_bytes()); - Cid::new_v0(mh).unwrap() - } -} diff --git a/unixfs/src/dir/builder/buffered.rs b/unixfs/src/dir/builder/buffered.rs new file mode 100644 index 000000000..924f6c2bc --- /dev/null +++ b/unixfs/src/dir/builder/buffered.rs @@ -0,0 +1,388 @@ +use super::{DirBuilder, Entry, Leaf, PostOrderIterator, TreeBuildingFailed, TreeOptions, Visited}; +use crate::Metadata; +use cid::Cid; +use std::collections::hash_map::Entry::*; + +/// UnixFs directory tree builder, which buffers entries until `build()` is called. +#[derive(Debug)] +pub struct BufferingTreeBuilder { + /// At the root there can be only one element, unless an option was given to create a new + /// directory surrounding the root elements. + root_builder: DirBuilder, + longest_path: usize, + // used to generate each node an unique id which is used when doing the post order traversal to + // recover all childrens rendered Cids + counter: u64, + opts: TreeOptions, +} + +impl Default for BufferingTreeBuilder { + fn default() -> Self { + Self::new(TreeOptions::default()) + } +} + +impl BufferingTreeBuilder { + /// Construct a new tree builder with the given configuration. 
+ pub fn new(opts: TreeOptions) -> Self { + BufferingTreeBuilder { + root_builder: DirBuilder::root(0), + longest_path: 0, + counter: 1, + opts, + } + } + + /// Records the give path to be a link to the following cid. + /// + /// FIXME: this should be renamed as "put_leaf" or "put_opaque_leaf". + pub fn put_file( + &mut self, + full_path: &str, + target: Cid, + total_size: u64, + ) -> Result<(), TreeBuildingFailed> { + // inserted at the depth + let leaf = Leaf { + link: target, + total_size, + }; + + self.modify_with(full_path, |parent, basename, _| { + parent + .put_leaf(basename, leaf) + .map_err(|_| TreeBuildingFailed::DuplicatePath(full_path.to_string())) + }) + } + + /// Directories get "put" implicitly through the put files, and directories need to be adjusted + /// only when wanting them to have metadata. + pub fn set_metadata( + &mut self, + full_path: &str, + metadata: Metadata, + ) -> Result<(), TreeBuildingFailed> { + // create all paths along the way + // + // set if not set, error otherwise? FIXME: doesn't error atm + self.modify_with(full_path, |parent, basename, id| { + parent + .add_or_get_node(basename, id) + .map_err(|_| TreeBuildingFailed::LeafAsDirectory(full_path.to_string()))? 
+ .set_metadata(metadata); + Ok(()) + }) + } + + fn modify_with(&mut self, full_path: &str, f: F) -> Result<(), TreeBuildingFailed> + where + F: FnOnce(&mut DirBuilder, String, &mut Option) -> Result<(), TreeBuildingFailed>, + { + // create all paths along the way + // + // assuming it's ok to split '/' since that cannot be escaped in linux at least + + self.longest_path = full_path.len().max(self.longest_path); + let mut remaining = full_path.split('/').enumerate().peekable(); + let mut dir_builder = &mut self.root_builder; + + // needed to avoid borrowing into the DirBuilder::new calling closure + let counter = &mut self.counter; + + while let Some((depth, next)) = remaining.next() { + let last = remaining.peek().is_none(); + + match (depth, next, last) { + // this might need to be accepted in case there is just a single file + (0, "", true) => { /* accepted */ } + (0, "", false) => { + return Err(TreeBuildingFailed::RootedPath(full_path.to_string())) + } + (_, "", false) => { + return Err(TreeBuildingFailed::RepeatSlashesInPath( + full_path.to_string(), + )) + } + (_, "", true) => todo!("path ends in slash"), + _ => {} + } + + // our first level can be full given the options + let full = depth == 0 && !self.opts.wrap_in_directory && dir_builder.is_empty(); + + if last { + let mut next_id = Some(*counter); + + let ret = if full { + Err(TreeBuildingFailed::TooManyRootLevelEntries) + } else { + f(dir_builder, next.to_string(), &mut next_id) + }; + + if next_id.is_none() { + *counter += 1; + } + + if ret.is_err() { + // FIXME: there might be a case where we have now stale nodes in our tree but + // cannot figure out an example for that. 
+ } + + return ret; + } + + let parent_id = dir_builder.id; + + dir_builder = match (full, dir_builder.nodes.entry(next.to_string())) { + (_, Occupied(oe)) => oe + .into_mut() + .as_dir_builder() + .map_err(|_| TreeBuildingFailed::LeafAsDirectory(full_path.to_string()))?, + (false, Vacant(ve)) => { + let next_id = *counter; + *counter += 1; + ve.insert(Entry::Directory(DirBuilder::new(parent_id, next_id))) + .as_dir_builder() + .expect("safe: we just inserted a DirBuilder") + } + (true, Vacant(_)) => return Err(TreeBuildingFailed::TooManyRootLevelEntries), + }; + } + + // as the str::split will always return a single element this should not ever be hit + unreachable!( + "walked the full_path but failed to add anything: {:?}", + full_path + ); + } + + /// Called to build the tree. The built tree will have the added files and their implied + /// directory structure, along with the any directory entries which were created using + /// `set_metadata`. To build the whole hierarchy, one must iterate the returned iterator to + /// completion while storing the created blocks. + /// + /// Returned `PostOrderIterator` will use the given `full_path` and `block_buffer` to store + /// it's data during the walk. `PostOrderIterator` implements `Iterator` while also allowing + /// borrowed access via `next_borrowed`. 
+ pub fn build<'a>( + self, + full_path: &'a mut String, + block_buffer: &'a mut Vec, + ) -> PostOrderIterator<'a> { + PostOrderIterator::new( + Visited::Descent { + node: self.root_builder, + name: None, + depth: 0, + }, + full_path, + block_buffer, + self.opts, + ) + } +} + +#[cfg(test)] +mod tests { + use super::{BufferingTreeBuilder, Metadata, TreeBuildingFailed, TreeOptions}; + use cid::Cid; + use std::convert::TryFrom; + + #[test] + fn some_directories() { + let mut builder = BufferingTreeBuilder::default(); + + // foobar\n + let five_block_foobar = + Cid::try_from("QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6").unwrap(); + + builder + .put_file("a/b/c/d/e/f/g.txt", five_block_foobar.clone(), 221) + .unwrap(); + builder + .put_file("a/b/c/d/e/h.txt", five_block_foobar.clone(), 221) + .unwrap(); + builder + .put_file("a/b/c/d/e/i.txt", five_block_foobar.clone(), 221) + .unwrap(); + + let mut full_path = String::new(); + let mut buffer = Vec::new(); + + let iter = builder.build(&mut full_path, &mut buffer); + let mut actual = iter + .map(|res| res.map(|(p, cid, buf)| (p, cid, buf))) + .collect::, _>>() + .unwrap(); + + let mut expected = vec![ + ( + "a/b/c/d/e/f", + "Qmbgf44ztW9wLcGNRNYGinGQB6SQDQtbHVbkM5MrWms698", + ), + ( + "a/b/c/d/e", + "Qma1hCr3CuPRAq2Gw4DCNMqsi42Bjs4Bt1MGSS57kNh144", + ), + ("a/b/c/d", "QmUqaYatcJqiSFdykHXGh4Nog1eMSfDJBeYzcG67KV5Ri4"), + ("a/b/c", "QmYwaNBaGpDCNN9XpHmjxVPHmEXZMw9KDY3uikE2UU5fVB"), + ("a/b", "QmeAzCPig4o4gBLh2LvP96Sr8MUBrsu2Scw9MTq1EvTDhY"), + ("a", "QmSTUFaPwJW8xD4KNRLLQRqVTYtYC29xuhYTJoYPWdzvKp"), + ]; + + // hopefully this way the errors will be easier to hunt down + + actual.reverse(); + expected.reverse(); + + while let Some(actual) = actual.pop() { + let expected = expected.pop().expect("size mismatch"); + assert_eq!(actual.0, expected.0); + assert_eq!( + actual.1.to_string(), + expected.1, + "{:?}: {:?}", + actual.0, + Hex(&actual.2) + ); + } + } + + struct Hex<'a>(&'a [u8]); + use std::fmt; + + impl<'a> 
#[cfg(test)]
mod tests {
    use super::{BufferingTreeBuilder, Metadata, TreeBuildingFailed, TreeOptions};
    use cid::Cid;
    use std::convert::TryFrom;
    use std::fmt;

    /// Renders a byte slice as lowercase hex in assertion messages.
    struct Hex<'a>(&'a [u8]);

    impl<'a> fmt::Debug for Hex<'a> {
        fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
            for byte in self.0 {
                write!(fmt, "{:02x}", byte)?;
            }
            Ok(())
        }
    }

    /// Returns a quick and dirty sha2-256 of the given number as a Cidv0
    fn some_cid(number: usize) -> Cid {
        use multihash::Sha2_256;
        let digest = Sha2_256::digest(&number.to_le_bytes());
        Cid::new_v0(digest).unwrap()
    }

    #[test]
    fn some_directories() {
        let mut builder = BufferingTreeBuilder::default();

        // foobar\n
        let five_block_foobar =
            Cid::try_from("QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6").unwrap();

        for path in &["a/b/c/d/e/f/g.txt", "a/b/c/d/e/h.txt", "a/b/c/d/e/i.txt"] {
            builder
                .put_file(path, five_block_foobar.clone(), 221)
                .unwrap();
        }

        let mut full_path = String::new();
        let mut buffer = Vec::new();

        let produced = builder
            .build(&mut full_path, &mut buffer)
            .collect::<Result<Vec<_>, _>>()
            .unwrap();

        let expected = [
            (
                "a/b/c/d/e/f",
                "Qmbgf44ztW9wLcGNRNYGinGQB6SQDQtbHVbkM5MrWms698",
            ),
            (
                "a/b/c/d/e",
                "Qma1hCr3CuPRAq2Gw4DCNMqsi42Bjs4Bt1MGSS57kNh144",
            ),
            ("a/b/c/d", "QmUqaYatcJqiSFdykHXGh4Nog1eMSfDJBeYzcG67KV5Ri4"),
            ("a/b/c", "QmYwaNBaGpDCNN9XpHmjxVPHmEXZMw9KDY3uikE2UU5fVB"),
            ("a/b", "QmeAzCPig4o4gBLh2LvP96Sr8MUBrsu2Scw9MTq1EvTDhY"),
            ("a", "QmSTUFaPwJW8xD4KNRLLQRqVTYtYC29xuhYTJoYPWdzvKp"),
        ];

        // hopefully this way the errors will be easier to hunt down
        assert_eq!(produced.len(), expected.len(), "size mismatch");

        for ((path, cid, block), (expected_path, expected_cid)) in
            produced.iter().zip(expected.iter())
        {
            assert_eq!(path, expected_path);
            assert_eq!(
                cid.to_string(),
                *expected_cid,
                "{:?}: {:?}",
                path,
                Hex(block)
            );
        }
    }

    #[test]
    fn empty_path() {
        let mut builder = BufferingTreeBuilder::default();
        builder.put_file("", some_cid(0), 1).unwrap();

        let mut full_path = String::new();
        let mut buffer = Vec::new();

        let paths = builder
            .build(&mut full_path, &mut buffer)
            .map(|res| res.map(|(path, _, _)| path))
            .collect::<Result<Vec<_>, _>>()
            .unwrap();

        assert!(
            paths.is_empty(),
            "wrapping in directory was not asked, single element"
        );
    }

    #[test]
    #[should_panic]
    fn rooted_path() {
        let mut builder = BufferingTreeBuilder::default();
        builder.put_file("/a", some_cid(0), 1).unwrap();
    }

    #[test]
    #[should_panic]
    fn successive_slashes() {
        let mut builder = BufferingTreeBuilder::default();
        builder.put_file("a//b", some_cid(0), 1).unwrap();
    }

    #[test]
    fn multiple_roots() {
        // foobar\n
        let five_block_foobar =
            Cid::try_from("QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6").unwrap();

        let opts = TreeOptions::default().with_wrap_in_directory();
        let mut builder = BufferingTreeBuilder::new(opts);
        builder
            .put_file("a", five_block_foobar.clone(), 221)
            .unwrap();
        builder.put_file("b", five_block_foobar, 221).unwrap();

        let mut full_path = String::new();
        let mut buffer = Vec::new();

        let paths = builder
            .build(&mut full_path, &mut buffer)
            .map(|res| res.map(|(path, _, _)| path))
            .collect::<Result<Vec<_>, _>>()
            .unwrap();

        assert_eq!(paths, &["QmdbWuhpVCX9weVMMqvVTMeGwKMqCNJDbx7ZK1zG36sea7"]);
    }

    #[test]
    #[should_panic]
    fn denied_multiple_root_dirs() {
        let mut builder = BufferingTreeBuilder::default();
        builder.put_file("a/c.txt", some_cid(0), 1).unwrap();
        builder.put_file("b/d.txt", some_cid(1), 1).unwrap();
    }

    #[test]
    #[should_panic]
    fn denied_multiple_root_files() {
        let mut builder = BufferingTreeBuilder::default();
        builder.put_file("a.txt", some_cid(0), 1).unwrap();
        builder.put_file("b.txt", some_cid(1), 1).unwrap();
    }

    #[test]
    #[should_panic]
    fn using_leaf_as_node() {
        let mut builder = BufferingTreeBuilder::default();
        builder.put_file("a.txt", some_cid(0), 1).unwrap();
        builder.put_file("a.txt/b.txt", some_cid(1), 1).unwrap();
    }

    #[test]
    fn set_metadata_before_files() {
        let mut builder = BufferingTreeBuilder::default();
        builder
            .set_metadata("a/b/c/d", Metadata::default())
            .unwrap();
        builder.put_file("a/b/c/d/e.txt", some_cid(1), 1).unwrap();
        builder.put_file("a/b/c/d/f.txt", some_cid(2), 1).unwrap();

        let mut full_path = String::new();
        let mut buffer = Vec::new();

        let paths = builder
            .build(&mut full_path, &mut buffer)
            .map(|res| res.map(|(path, _, _)| path))
            .collect::<Result<Vec<_>, _>>()
            .unwrap();

        assert_eq!(paths, &["a/b/c/d", "a/b/c", "a/b", "a",])
    }

    #[test]
    fn set_metadata_on_file() {
        let mut builder = BufferingTreeBuilder::default();
        builder.put_file("a/a.txt", some_cid(0), 1).unwrap();
        let err = builder
            .set_metadata("a/a.txt", Metadata::default())
            .unwrap_err();

        assert!(
            matches!(err, TreeBuildingFailed::LeafAsDirectory(_)),
            "{:?}",
            err
        );
    }
}
+#[derive(Debug)] +pub(super) struct DirBuilder { + /// Immediate files, symlinks or directories in this directory + pub nodes: HashMap, + /// Metadata for this directory + metadata: Metadata, + /// Id of the parent; None for the root node + pub parent_id: Option, + /// Internal id, used for propagating Cids back from children during post order visit. + pub id: u64, +} + +impl DirBuilder { + pub fn new(parent_id: u64, id: u64) -> Self { + assert_ne!(parent_id, id); + DirBuilder { + nodes: HashMap::new(), + metadata: Default::default(), + parent_id: Some(parent_id), + id, + } + } + + pub fn root(id: u64) -> Self { + DirBuilder { + nodes: HashMap::new(), + metadata: Default::default(), + parent_id: None, + id, + } + } + + pub fn put_leaf(&mut self, key: String, leaf: Leaf) -> Result<(), DuplicateName> { + match self.nodes.entry(key) { + Occupied(_) => Err(DuplicateName), + Vacant(ve) => { + ve.insert(Entry::Leaf(leaf)); + Ok(()) + } + } + } + + pub fn add_or_get_node( + &mut self, + key: String, + id: &mut Option, + ) -> Result<&mut DirBuilder, FoundLeaf> { + match self.nodes.entry(key) { + Occupied(oe) => oe.into_mut().as_dir_builder().map_err(|_| FoundLeaf), + Vacant(ve) => { + let id = id.take().unwrap(); + let entry = ve.insert(Entry::Directory(Self::new(self.id, id))); + Ok(entry.as_dir_builder().expect("just inserted")) + } + } + } + + pub fn len(&self) -> usize { + self.nodes.len() + } + + pub fn is_empty(&self) -> bool { + self.len() != 0 + } + + pub fn set_metadata(&mut self, metadata: Metadata) { + self.metadata = metadata; + } +} diff --git a/unixfs/src/dir/builder/iter.rs b/unixfs/src/dir/builder/iter.rs new file mode 100644 index 000000000..f665f367f --- /dev/null +++ b/unixfs/src/dir/builder/iter.rs @@ -0,0 +1,285 @@ +use super::{Entry, Leaf, TreeConstructionFailed, TreeOptions, Visited}; +use cid::Cid; +use std::collections::{BTreeMap, HashMap}; +use std::fmt::{self, Write}; + +/// Constructs the directory nodes required for a tree. 
+/// +/// Implements the Iterator interface for owned values and the borrowed version `next_borrowed`. +/// Tree is fully constructed once this has been exhausted. +pub struct PostOrderIterator<'a> { + pub(super) full_path: &'a mut String, + pub(super) old_depth: usize, + pub(super) block_buffer: &'a mut Vec, + // our stack of pending work + pub(super) pending: Vec, + // "communication channel" from nested entries back to their parents + pub(super) persisted_cids: HashMap, BTreeMap>, + pub(super) reused_children: Vec, + pub(super) cid: Option, + pub(super) total_size: u64, + // from TreeOptions + pub(super) wrap_in_directory: bool, +} + +impl<'a> PostOrderIterator<'a> { + pub(super) fn new( + root: Visited, + full_path: &'a mut String, + block_buffer: &'a mut Vec, + opts: TreeOptions, + ) -> Self { + full_path.clear(); + block_buffer.clear(); + + PostOrderIterator { + full_path, + old_depth: 0, + block_buffer, + pending: vec![root], + persisted_cids: Default::default(), + reused_children: Vec::new(), + cid: None, + total_size: 0, + wrap_in_directory: opts.wrap_in_directory, + } + } + + fn render_directory( + links: &BTreeMap, + buffer: &mut Vec, + ) -> Result { + use crate::pb::{FlatUnixFs, PBLink, UnixFs, UnixFsType}; + use quick_protobuf::{BytesWriter, MessageWrite, Writer}; + use sha2::{Digest, Sha256}; + use std::borrow::Cow; + + // TODO: this could quite easily be made so that the links are read from the btreemap for + // calculating the size and rendering + let mut combined_from_links = 0; + + let flat = FlatUnixFs { + links: links + .iter() // .drain() would be the most reasonable + .inspect(|(_, Leaf { total_size, .. 
})| combined_from_links += total_size) + .map(|(name, Leaf { link, total_size })| PBLink { + Hash: Some(link.to_bytes().into()), + Name: Some(Cow::Borrowed(name.as_str())), + Tsize: Some(*total_size), + }) + .collect::>(), + data: UnixFs { + Type: UnixFsType::Directory, + Data: None, + ..Default::default() + }, + }; + + let size = flat.get_size(); + + // FIXME: we shouldn't be creating too large structures (bitswap block size limit!) + // FIXME: changing this to autosharding is going to take some thinking + + buffer.clear(); + let cap = buffer.capacity(); + + if let Some(additional) = size.checked_sub(cap) { + buffer.reserve(additional); + } + + // TODO: this could be done more integelligently; for example, we could just zero extend + // on reserving, then just truncate or somehow carry around the real length of the buffer + // to avoid truncating and zero extending. + buffer.extend(std::iter::repeat(0).take(size)); + + let mut writer = Writer::new(BytesWriter::new(&mut buffer[..])); + flat.write_message(&mut writer) + .expect("unsure how this could fail"); + let mh = multihash::wrap(multihash::Code::Sha2_256, &Sha256::digest(&buffer)); + let cid = Cid::new_v0(mh).expect("sha2_256 is the correct multihash for cidv0"); + + Ok(Leaf { + link: cid, + total_size: buffer.len() as u64 + combined_from_links, + }) + } + + /// Construct the next dag-pb node, if any. + pub fn next_borrowed<'b>( + &'b mut self, + ) -> Option> { + while let Some(visited) = self.pending.pop() { + let (name, depth) = match &visited { + Visited::Descent { name, depth, .. } => (name.as_deref(), *depth), + Visited::Post { name, depth, .. 
} => (name.as_deref(), *depth), + }; + + update_full_path((self.full_path, &mut self.old_depth), name, depth); + + match visited { + Visited::Descent { node, name, depth } => { + let mut leaves = Vec::new(); + + let children = &mut self.reused_children; + + for (k, v) in node.nodes { + match v { + Entry::Directory(node) => children.push(Visited::Descent { + node, + name: Some(k), + depth: depth + 1, + }), + Entry::Leaf(leaf) => leaves.push((k, leaf)), + } + } + + self.pending.push(Visited::Post { + parent_id: node.parent_id, + id: node.id, + name, + depth, + leaves, + }); + + let any_children = !children.is_empty(); + + self.pending.extend(children.drain(..)); + + if any_children { + // we could strive to do everything right now but pushing and popping might + // turn out easier code wise, or in other words, when there are no child_nodes + // we wouldn't need to go through Visited::Post. + } + } + Visited::Post { + parent_id, + id, + name, + leaves, + .. + } => { + // all of our children have now been visited; we should be able to find their + // Cids in the btreemap + let mut collected = self.persisted_cids.remove(&Some(id)).unwrap_or_default(); + + // FIXME: leaves could be drained and reused + collected.extend(leaves); + + if !self.wrap_in_directory && parent_id.is_none() { + // we aren't supposed to wrap_in_directory, and we are now looking at the + // possibly to be generated root directory. + + assert_eq!( + collected.len(), + 1, + "should not have gone this far with multiple added roots" + ); + + return None; + } + + // render unixfs, maybe return it? 
+ let buffer = &mut self.block_buffer; + buffer.clear(); + + let leaf = match Self::render_directory(&collected, buffer) { + Ok(leaf) => leaf, + Err(e) => return Some(Err(e)), + }; + + self.cid = Some(leaf.link.clone()); + self.total_size = leaf.total_size; + + // this reuse strategy is probably good enough + collected.clear(); + + if let Some(name) = name { + // name is none only for the wrap_in_directory, which cannot really be + // propagated up but still the parent_id is allowed to be None + let previous = self + .persisted_cids + .entry(parent_id) + .or_insert(collected) + .insert(name, leaf); + + assert!(previous.is_none()); + } + + if parent_id.is_none() { + // rewrite the full_path for the wrap_in_directory + assert!( + self.full_path.is_empty(), + "full_path should had been empty but it was not: {:?}", + self.full_path + ); + // at the wrap_in_directory level the name should be the root level Cid + write!(self.full_path, "{}", self.cid.as_ref().unwrap()).unwrap(); + self.old_depth += 1; + } + + return Some(Ok(( + self.full_path.as_str(), + self.cid.as_ref().unwrap(), + self.total_size, + &self.block_buffer, + ))); + } + } + } + None + } +} + +impl<'a> Iterator for PostOrderIterator<'a> { + type Item = Result<(String, Cid, Box<[u8]>), TreeConstructionFailed>; + + fn next(&mut self) -> Option { + self.next_borrowed().map(|res| { + res.map(|(full_path, cid, _, block)| { + (full_path.to_string(), cid.to_owned(), block.into()) + }) + }) + } +} + +fn update_full_path( + (full_path, old_depth): (&mut String, &mut usize), + name: Option<&str>, + depth: usize, +) { + if depth < 2 { + // initially thought it might be good idea to add slash to all components; removing it made + // it impossible to get back down to empty string, so fixing this for depths 0 and 1. 
/// Keeps `full_path` (and the segment count `old_depth`) in sync while the post-order walk moves
/// between tree nodes: pops segments until `old_depth` is below `depth`, then appends `name`.
///
/// Panics (via the final `assert_eq!`) if the resulting depth does not match `depth`.
fn update_full_path(
    (full_path, old_depth): (&mut String, &mut usize),
    name: Option<&str>,
    depth: usize,
) {
    if depth < 2 {
        // initially thought it might be a good idea to add slash to all components; removing it
        // made it impossible to get back down to empty string, so fixing this for depths 0 and 1
        // by resetting the whole path.
        full_path.clear();
        *old_depth = 0;
    } else {
        while *old_depth >= depth && *old_depth > 0 {
            // we now want to pop the last segment
            // this would be easier with pathbuf
            let slash_at = full_path.bytes().rposition(|ch| ch == b'/');
            if let Some(slash_at) = slash_at {
                full_path.truncate(slash_at);
                *old_depth -= 1;
            } else {
                todo!(
                    "no last slash_at in {:?} yet {} >= {}",
                    full_path,
                    old_depth,
                    depth
                );
            }
        }
    }

    debug_assert!(*old_depth <= depth);

    if let Some(name) = name {
        if !full_path.is_empty() {
            // single-character separator: `push` over `push_str`
            full_path.push('/');
        }
        full_path.push_str(name);
        *old_depth += 1;
    }

    assert_eq!(*old_depth, depth);
}
#[derive(Debug)] pub enum TreeBuildingFailed { + /// The given full path started with a slash; paths in the `/add` convention are not rooted. RootedPath(String), + /// The given full path contained empty segment. RepeatSlashesInPath(String), + /// If the `BufferingTreeBuilder` was created without `TreeOptions` with the option `wrap in + /// directory` enabled, then there can be only a single element at the root. TooManyRootLevelEntries, + /// The given full path had already been added. DuplicatePath(String), + /// The given full path had already been added as a link to an opaque entry. LeafAsDirectory(String), } diff --git a/unixfs/src/dir/builder/iter.rs b/unixfs/src/dir/builder/iter.rs index f665f367f..f93b3572d 100644 --- a/unixfs/src/dir/builder/iter.rs +++ b/unixfs/src/dir/builder/iter.rs @@ -1,7 +1,7 @@ use super::{Entry, Leaf, TreeConstructionFailed, TreeOptions, Visited}; use cid::Cid; use std::collections::{BTreeMap, HashMap}; -use std::fmt::{self, Write}; +use std::fmt::Write; /// Constructs the directory nodes required for a tree. 
/// From 57737b49c629a9659c00af44859b7ea150abb42a Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 5 Aug 2020 22:45:03 +0300 Subject: [PATCH 11/57] refactor: detupleify into (Owned)?TreeNode --- http/src/v0/root_files/add.rs | 6 +-- unixfs/src/dir/builder.rs | 2 +- unixfs/src/dir/builder/buffered.rs | 12 +++--- unixfs/src/dir/builder/iter.rs | 63 +++++++++++++++++++++++------- 4 files changed, 59 insertions(+), 24 deletions(-) diff --git a/http/src/v0/root_files/add.rs b/http/src/v0/root_files/add.rs index 20ad6730f..148a6c6ef 100644 --- a/http/src/v0/root_files/add.rs +++ b/http/src/v0/root_files/add.rs @@ -4,7 +4,7 @@ use bytes::{buf::BufMutExt, Buf, BufMut, Bytes, BytesMut}; use cid::Cid; use futures::stream::{Stream, TryStreamExt}; use ipfs::unixfs::ll::{ - dir::builder::{BufferingTreeBuilder, TreeBuildingFailed, TreeConstructionFailed}, + dir::builder::{BufferingTreeBuilder, TreeBuildingFailed, TreeConstructionFailed, TreeNode}, file::adder::FileAdder, }; use ipfs::{Block, Ipfs, IpfsTypes}; @@ -204,7 +204,7 @@ where let mut iter = tree.build(&mut full_path, &mut block_buffer); while let Some(res) = iter.next_borrowed() { - let (path, cid, total, block) = res.map_err(AddError::TreeBuilding)?; + let TreeNode { path, cid, total_size, block } = res.map_err(AddError::TreeBuilding)?; // shame we need to allocate once again here.. 
ipfs.put_block(Block { cid: cid.to_owned(), data: block.into() }).await.map_err(AddError::Persisting)?; @@ -212,7 +212,7 @@ where serde_json::to_writer((&mut buffer).writer(), &Response::Added { name: Cow::Borrowed(path), hash: Quoted(cid), - size: Quoted(total), + size: Quoted(total_size), }).map_err(AddError::ResponseSerialization)?; buffer.put(&b"\r\n"[..]); diff --git a/unixfs/src/dir/builder.rs b/unixfs/src/dir/builder.rs index 1ad73d3bb..1c7d6cec0 100644 --- a/unixfs/src/dir/builder.rs +++ b/unixfs/src/dir/builder.rs @@ -5,7 +5,7 @@ mod dir_builder; use dir_builder::DirBuilder; mod iter; -pub use iter::PostOrderIterator; +pub use iter::{OwnedTreeNode, PostOrderIterator, TreeNode}; mod buffered; pub use buffered::BufferingTreeBuilder; diff --git a/unixfs/src/dir/builder/buffered.rs b/unixfs/src/dir/builder/buffered.rs index 924f6c2bc..de54b7696 100644 --- a/unixfs/src/dir/builder/buffered.rs +++ b/unixfs/src/dir/builder/buffered.rs @@ -184,7 +184,9 @@ impl BufferingTreeBuilder { #[cfg(test)] mod tests { - use super::{BufferingTreeBuilder, Metadata, TreeBuildingFailed, TreeOptions}; + use super::{ + super::OwnedTreeNode, BufferingTreeBuilder, Metadata, TreeBuildingFailed, TreeOptions, + }; use cid::Cid; use std::convert::TryFrom; @@ -211,7 +213,7 @@ mod tests { let iter = builder.build(&mut full_path, &mut buffer); let mut actual = iter - .map(|res| res.map(|(p, cid, buf)| (p, cid, buf))) + .map(|res| res.map(|n| (n.path, n.cid, n.block))) .collect::, _>>() .unwrap(); @@ -270,7 +272,7 @@ mod tests { let iter = builder.build(&mut full_path, &mut buffer); let actual = iter - .map(|res| res.map(|(p, _, _)| p)) + .map(|res| res.map(|OwnedTreeNode { path, .. }| path)) .collect::, _>>() .unwrap(); @@ -312,7 +314,7 @@ mod tests { let iter = builder.build(&mut full_path, &mut buffer); let actual = iter - .map(|res| res.map(|(p, _, _)| p)) + .map(|res| res.map(|OwnedTreeNode { path, .. 
}| path)) .collect::, _>>() .unwrap(); @@ -357,7 +359,7 @@ mod tests { let iter = builder.build(&mut full_path, &mut buffer); let actual = iter - .map(|res| res.map(|(p, _, _)| p)) + .map(|res| res.map(|OwnedTreeNode { path, .. }| path)) .collect::, _>>() .unwrap(); diff --git a/unixfs/src/dir/builder/iter.rs b/unixfs/src/dir/builder/iter.rs index f93b3572d..1a6f22b88 100644 --- a/unixfs/src/dir/builder/iter.rs +++ b/unixfs/src/dir/builder/iter.rs @@ -105,9 +105,9 @@ impl<'a> PostOrderIterator<'a> { } /// Construct the next dag-pb node, if any. - pub fn next_borrowed<'b>( - &'b mut self, - ) -> Option> { + /// + /// Returns a `TreeNode` of the latest constructed tree node. + pub fn next_borrowed(&mut self) -> Option, TreeConstructionFailed>> { while let Some(visited) = self.pending.pop() { let (name, depth) = match &visited { Visited::Descent { name, depth, .. } => (name.as_deref(), *depth), @@ -217,12 +217,12 @@ impl<'a> PostOrderIterator<'a> { self.old_depth += 1; } - return Some(Ok(( - self.full_path.as_str(), - self.cid.as_ref().unwrap(), - self.total_size, - &self.block_buffer, - ))); + return Some(Ok(TreeNode { + path: self.full_path.as_str(), + cid: self.cid.as_ref().unwrap(), + total_size: self.total_size, + block: &self.block_buffer, + })); } } } @@ -231,17 +231,50 @@ impl<'a> PostOrderIterator<'a> { } impl<'a> Iterator for PostOrderIterator<'a> { - type Item = Result<(String, Cid, Box<[u8]>), TreeConstructionFailed>; + type Item = Result; fn next(&mut self) -> Option { - self.next_borrowed().map(|res| { - res.map(|(full_path, cid, _, block)| { - (full_path.to_string(), cid.to_owned(), block.into()) - }) - }) + self.next_borrowed() + .map(|res| res.map(TreeNode::into_owned)) } } +/// Borrowed representation of a node in the tree. +pub struct TreeNode<'a> { + /// Full path to the node. + pub path: &'a str, + /// The Cid of the document. + pub cid: &'a Cid, + /// Cumulative total size of the subtree in bytes. + pub total_size: u64, + /// Raw dag-pb document. 
+ pub block: &'a [u8], +} + +impl TreeNode<'_> { + /// Convert to an owned and detached representation. + pub fn into_owned(self) -> OwnedTreeNode { + OwnedTreeNode { + path: self.path.to_owned(), + cid: self.cid.to_owned(), + total_size: self.total_size, + block: self.block.into(), + } + } +} + +/// Owned representation of a node in the tree. +pub struct OwnedTreeNode { + /// Full path to the node. + pub path: String, + /// The Cid of the document. + pub cid: Cid, + /// Cumulative total size of the subtree in bytes. + pub total_size: u64, + /// Raw dag-pb document. + pub block: Box<[u8]>, +} + fn update_full_path( (full_path, old_depth): (&mut String, &mut usize), name: Option<&str>, From dd8d9d9bfe87dd69ff2ddaa0db2292c33a6ff26a Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 5 Aug 2020 22:47:37 +0300 Subject: [PATCH 12/57] conformance: enable more directory tests --- conformance/test/index.js | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/conformance/test/index.js b/conformance/test/index.js index c0f6d50ec..792ae6b86 100644 --- a/conformance/test/index.js +++ b/conformance/test/index.js @@ -76,13 +76,11 @@ tests.root.get(factory); tests.root.add(factory, { skip: [ // ordered in the order of most likely implementation + // directories: + "should wrap content in a directory", // progress: "should add a BIG Buffer with progress enabled", - // directories: - "should add a nested directory as array of tupples", "should add a nested directory as array of tupples with progress", - "should add files to a directory non sequentially", - "should wrap content in a directory", // unixfsv1.5 metadata "should add with mode as string", "should add with mode as number", From 8319e8bd97b1dece8721adea18735d07cb4f8a90 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Aug 2020 10:51:58 +0300 Subject: [PATCH 13/57] chore: remove extra clone (clippy) --- unixfs/src/dir/builder/buffered.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/unixfs/src/dir/builder/buffered.rs b/unixfs/src/dir/builder/buffered.rs index de54b7696..dc8247c69 100644 --- a/unixfs/src/dir/builder/buffered.rs +++ b/unixfs/src/dir/builder/buffered.rs @@ -205,7 +205,7 @@ mod tests { .put_file("a/b/c/d/e/h.txt", five_block_foobar.clone(), 221) .unwrap(); builder - .put_file("a/b/c/d/e/i.txt", five_block_foobar.clone(), 221) + .put_file("a/b/c/d/e/i.txt", five_block_foobar, 221) .unwrap(); let mut full_path = String::new(); From 4077de1a0f16174214e3ecea119b72f6f874971a Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Aug 2020 11:46:08 +0300 Subject: [PATCH 14/57] fix: report adderrors so that js-ipfs-http-client errors the mechanism is to just push a new line of MessageResponse. it's a hack, but at least the js-ipfs-http-client fails. --- http/src/v0/root_files/add.rs | 56 ++++++++++++++++++++++++----------- http/src/v0/support.rs | 6 ++++ 2 files changed, 44 insertions(+), 18 deletions(-) diff --git a/http/src/v0/root_files/add.rs b/http/src/v0/root_files/add.rs index 148a6c6ef..280ba0077 100644 --- a/http/src/v0/root_files/add.rs +++ b/http/src/v0/root_files/add.rs @@ -1,8 +1,11 @@ use super::AddArgs; use crate::v0::support::StringError; -use bytes::{buf::BufMutExt, Buf, BufMut, Bytes, BytesMut}; +use bytes::{ + buf::{BufExt, BufMutExt}, + Buf, BufMut, Bytes, BytesMut, +}; use cid::Cid; -use futures::stream::{Stream, TryStreamExt}; +use futures::stream::{Stream, StreamExt, TryStreamExt}; use ipfs::unixfs::ll::{ dir::builder::{BufferingTreeBuilder, TreeBuildingFailed, TreeConstructionFailed, TreeNode}, file::adder::FileAdder, @@ -26,23 +29,30 @@ pub(super) async fn add_inner( .map(|v| v.to_string()) .ok_or_else(|| StringError::from("missing 'boundary' on content-type"))?; - let stream = MultipartStream::new(Bytes::from(boundary), body.map_ok(|mut buf| buf.to_bytes())); + let st = MultipartStream::new(Bytes::from(boundary), body.map_ok(|mut buf| buf.to_bytes())); - // Stream> - // - // refine it to - // - 
// Stream> - // | | - // | convert rejection and stop the stream? - // | | - // | / - // Stream, impl std::error::Error + Send + Sync + 'static>> + let st = add_stream(ipfs, st); - let st = add_stream(ipfs, stream); + // map the errors into json objects at least as we cannot return them as trailers (yet) - // TODO: we could map the errors into json objects at least? (as we cannot return them as - // trailers) + let st = st.map(|res| match res { + passthrough @ Ok(_) | passthrough @ Err(AddError::ResponseSerialization(_)) => { + // there is nothing we should do or could do for these; the assumption is that hyper + // will send the bytes and stop on serialization error and log it. the response + // *should* be closed on the error. + passthrough + } + Err(something_else) => { + let msg = crate::v0::support::MessageResponseBuilder::default() + .with_message(format!("{}", something_else)); + let bytes: Bytes = serde_json::to_vec(&msg) + .expect("serializing here should not have failed") + .into(); + let crlf = Bytes::from(&b"\r\n"[..]); + // note that here we are assuming that the stream ends on error + Ok(bytes.chain(crlf).to_bytes()) + } + }); let body = crate::v0::support::StreamResponse(st); @@ -70,8 +80,18 @@ impl From for AddError { impl fmt::Display for AddError { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { - // TODO - write!(fmt, "{:?}", self) + use AddError::*; + match self { + Parsing(me) => write!(fmt, "invalid request body: {}", me), + Header(me) => write!(fmt, "invalid multipart header(s): {}", me), + InvalidFilename(e) => write!(fmt, "invalid multipart filename: {:?}", e), + UnsupportedField(name) => write!(fmt, "unsupported field name: {:?}", name), + UnsupportedContentType(t) => write!(fmt, "unsupported content-type: {:?} (supported: application/{{octet-stream,x-directory}})", t), + ResponseSerialization(e) => write!(fmt, "progress serialization failed: {}", e), + Persisting(e) => write!(fmt, "put_block failed: {}", e), + 
TreeGathering(g) => write!(fmt, "invalid directory tree: {}", g), + TreeBuilding(b) => write!(fmt, "constructed invalid directory tree: {}", b), + } } } diff --git a/http/src/v0/support.rs b/http/src/v0/support.rs index 064dd2055..d0c54bc4d 100644 --- a/http/src/v0/support.rs +++ b/http/src/v0/support.rs @@ -53,6 +53,12 @@ impl MessageKind { #[derive(Debug, Clone)] pub struct MessageResponseBuilder(MessageKind, usize); +impl Default for MessageResponseBuilder { + fn default() -> Self { + MessageResponseBuilder(MessageKind::Error, 0) + } +} + impl MessageResponseBuilder { pub fn with_message>>(self, message: S) -> MessageResponse { let Self(kind, code) = self; From 65acedae1d501df32b4d93f28b0b928b1be6890b Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Aug 2020 13:14:46 +0300 Subject: [PATCH 15/57] fix: normalize prefix slash away to enable more tests --- conformance/test/index.js | 6 ------ http/src/v0/root_files/add.rs | 12 +++++++++++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/conformance/test/index.js b/conformance/test/index.js index 792ae6b86..bef988494 100644 --- a/conformance/test/index.js +++ b/conformance/test/index.js @@ -88,12 +88,6 @@ tests.root.add(factory, { "should add with mtime as { nsecs, secs }", "should add with mtime as timespec", "should add with mtime as hrtime", - // filesystem (maybe) - "should add a directory from the file system", - "should add a directory from the file system with an odd name", - "should ignore a directory from the file system", - "should add a file from the file system", - "should add a hidden file in a directory from the file system", // raw leaves "should respect raw leaves when file is smaller than one block and no metadata is present", "should override raw leaves when file is smaller than one block and metadata is present", diff --git a/http/src/v0/root_files/add.rs b/http/src/v0/root_files/add.rs index 280ba0077..93a7d0c47 100644 --- a/http/src/v0/root_files/add.rs +++ 
b/http/src/v0/root_files/add.rs @@ -119,9 +119,19 @@ where let filename = field.filename().map_err(AddError::Header)?; let filename = percent_encoding::percent_decode_str(filename) .decode_utf8() - .map(|cow| cow.into_owned()) .map_err(AddError::InvalidFilename)?; + let filename = if filename.starts_with('/') { + // normalize single first slash; seems similar to what js-ipfs does: filesystem + // test cases post with paths '/some-directory/...' and others post with + // 'some-directory/...'. + + // since slash is single code point we can just + filename[1..].to_owned() + } else { + filename.into_owned() + }; + let content_type = field.content_type().map_err(AddError::Header)?; let next = match content_type { From 14112c3dca9eed3e48cb6b0a8a69c61f648409b7 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Aug 2020 13:32:00 +0300 Subject: [PATCH 16/57] feat: /add?wrap_with_directory=true previous implementation included "defaulting the name for the wrapper dir" which was extra and not expected by the interface tests. 
--- conformance/test/index.js | 2 -- http/src/v0/root_files.rs | 5 +++- http/src/v0/root_files/add.rs | 20 ++++++++++++---- unixfs/src/dir/builder/buffered.rs | 38 ++++++++++++++++++++++++++++-- unixfs/src/dir/builder/iter.rs | 13 ---------- 5 files changed, 55 insertions(+), 23 deletions(-) diff --git a/conformance/test/index.js b/conformance/test/index.js index bef988494..8e55d4ca5 100644 --- a/conformance/test/index.js +++ b/conformance/test/index.js @@ -76,8 +76,6 @@ tests.root.get(factory); tests.root.add(factory, { skip: [ // ordered in the order of most likely implementation - // directories: - "should wrap content in a directory", // progress: "should add a BIG Buffer with progress enabled", "should add a nested directory as array of tupples with progress", diff --git a/http/src/v0/root_files.rs b/http/src/v0/root_files.rs index 5bbadf99d..ac85c4233 100644 --- a/http/src/v0/root_files.rs +++ b/http/src/v0/root_files.rs @@ -24,11 +24,14 @@ mod add; #[derive(Debug, Deserialize)] pub struct AddArgs { // probably never interesting - #[serde(default)] + #[serde(default, rename = "stream-channels")] stream_channels: bool, // unsure what this does #[serde(default)] progress: bool, + /// When true, a new directory is created to hold more than 1 root level directories. 
+ #[serde(default, rename = "wrap-with-directory")] + wrap_with_directory: bool, } pub fn add( diff --git a/http/src/v0/root_files/add.rs b/http/src/v0/root_files/add.rs index 93a7d0c47..d4317e33f 100644 --- a/http/src/v0/root_files/add.rs +++ b/http/src/v0/root_files/add.rs @@ -7,7 +7,9 @@ use bytes::{ use cid::Cid; use futures::stream::{Stream, StreamExt, TryStreamExt}; use ipfs::unixfs::ll::{ - dir::builder::{BufferingTreeBuilder, TreeBuildingFailed, TreeConstructionFailed, TreeNode}, + dir::builder::{ + BufferingTreeBuilder, TreeBuildingFailed, TreeConstructionFailed, TreeNode, TreeOptions, + }, file::adder::FileAdder, }; use ipfs::{Block, Ipfs, IpfsTypes}; @@ -20,7 +22,7 @@ use warp::{Rejection, Reply}; pub(super) async fn add_inner( ipfs: Ipfs, - _opts: AddArgs, + opts: AddArgs, content_type: Mime, body: impl Stream> + Send + Unpin + 'static, ) -> Result { @@ -31,7 +33,7 @@ pub(super) async fn add_inner( let st = MultipartStream::new(Bytes::from(boundary), body.map_ok(|mut buf| buf.to_bytes())); - let st = add_stream(ipfs, st); + let st = add_stream(ipfs, st, opts); // map the errors into json objects at least as we cannot return them as trailers (yet) @@ -100,14 +102,22 @@ impl std::error::Error for AddError {} fn add_stream( ipfs: Ipfs, mut fields: MultipartStream, + opts: AddArgs, ) -> impl Stream> + Send + 'static where St: Stream> + Send + Unpin + 'static, E: Into + Send + 'static, { async_stream::try_stream! 
{ - // TODO: wrap-in-directory option - let mut tree = BufferingTreeBuilder::default(); + + let tree_opts = TreeOptions::default(); + let tree_opts = if opts.wrap_with_directory { + tree_opts.with_wrap_in_directory() + } else { + tree_opts + }; + + let mut tree = BufferingTreeBuilder::new(tree_opts); let mut buffer = BytesMut::new(); while let Some(mut field) = fields diff --git a/unixfs/src/dir/builder/buffered.rs b/unixfs/src/dir/builder/buffered.rs index dc8247c69..823af1c70 100644 --- a/unixfs/src/dir/builder/buffered.rs +++ b/unixfs/src/dir/builder/buffered.rs @@ -314,11 +314,45 @@ mod tests { let iter = builder.build(&mut full_path, &mut buffer); let actual = iter - .map(|res| res.map(|OwnedTreeNode { path, .. }| path)) + .map(|res| res.map(|OwnedTreeNode { path, cid, .. }| (path, cid.to_string()))) .collect::, _>>() .unwrap(); - assert_eq!(actual, &["QmdbWuhpVCX9weVMMqvVTMeGwKMqCNJDbx7ZK1zG36sea7"]); + assert_eq!( + actual, + &[( + "".to_string(), + "QmdbWuhpVCX9weVMMqvVTMeGwKMqCNJDbx7ZK1zG36sea7".to_string() + )] + ); + } + + #[test] + fn single_wrapped_root() { + // foobar\n + let five_block_foobar = + Cid::try_from("QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6").unwrap(); + + let opts = TreeOptions::default().with_wrap_in_directory(); + let mut builder = BufferingTreeBuilder::new(opts); + builder.put_file("a", five_block_foobar, 221).unwrap(); + + let mut full_path = String::new(); + let mut buffer = Vec::new(); + + let iter = builder.build(&mut full_path, &mut buffer); + let actual = iter + .map(|res| res.map(|OwnedTreeNode { path, cid, .. 
}| (path, cid.to_string()))) + .collect::, _>>() + .unwrap(); + + assert_eq!( + actual, + &[( + "".to_string(), + "QmQBseoi3b2FBrYhjM2E4mCF4Q7C8MgCUbzAbGNfyVwgNk".to_string() + )] + ); } #[test] diff --git a/unixfs/src/dir/builder/iter.rs b/unixfs/src/dir/builder/iter.rs index 1a6f22b88..819b7eecf 100644 --- a/unixfs/src/dir/builder/iter.rs +++ b/unixfs/src/dir/builder/iter.rs @@ -1,7 +1,6 @@ use super::{Entry, Leaf, TreeConstructionFailed, TreeOptions, Visited}; use cid::Cid; use std::collections::{BTreeMap, HashMap}; -use std::fmt::Write; /// Constructs the directory nodes required for a tree. /// @@ -205,18 +204,6 @@ impl<'a> PostOrderIterator<'a> { assert!(previous.is_none()); } - if parent_id.is_none() { - // rewrite the full_path for the wrap_in_directory - assert!( - self.full_path.is_empty(), - "full_path should had been empty but it was not: {:?}", - self.full_path - ); - // at the wrap_in_directory level the name should be the root level Cid - write!(self.full_path, "{}", self.cid.as_ref().unwrap()).unwrap(); - self.old_depth += 1; - } - return Some(Ok(TreeNode { path: self.full_path.as_str(), cid: self.cid.as_ref().unwrap(), From 5534fef2b8d3d49e5b8cefe60f1f25e234aae087 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Aug 2020 13:38:12 +0300 Subject: [PATCH 17/57] refactor: wrap_in_directory => wrap_with_directory just aligning the terminology and trying not to invent new terms. --- http/src/v0/root_files/add.rs | 10 ++++------ unixfs/src/dir/builder.rs | 14 +++++++------- unixfs/src/dir/builder/buffered.rs | 8 +++++--- unixfs/src/dir/builder/iter.rs | 10 +++++----- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/http/src/v0/root_files/add.rs b/http/src/v0/root_files/add.rs index d4317e33f..5ea8dd48d 100644 --- a/http/src/v0/root_files/add.rs +++ b/http/src/v0/root_files/add.rs @@ -110,12 +110,10 @@ where { async_stream::try_stream! 
{ - let tree_opts = TreeOptions::default(); - let tree_opts = if opts.wrap_with_directory { - tree_opts.with_wrap_in_directory() - } else { - tree_opts - }; + let mut tree_opts = TreeOptions::default(); + if opts.wrap_with_directory { + tree_opts.wrap_with_directory(); + } let mut tree = BufferingTreeBuilder::new(tree_opts); let mut buffer = BytesMut::new(); diff --git a/unixfs/src/dir/builder.rs b/unixfs/src/dir/builder.rs index 1c7d6cec0..64ac93f81 100644 --- a/unixfs/src/dir/builder.rs +++ b/unixfs/src/dir/builder.rs @@ -48,16 +48,16 @@ impl fmt::Debug for Leaf { } /// Configuration for customizing how the tree is built. -#[derive(Default, Debug)] +#[derive(Default, Debug, Clone)] pub struct TreeOptions { - wrap_in_directory: bool, + wrap_with_directory: bool, } impl TreeOptions { - /// When true, allow multiple top level entries, otherwise error on the second entry - pub fn with_wrap_in_directory(mut self) -> TreeOptions { - self.wrap_in_directory = true; - self + /// When true, allow multiple top level entries, otherwise error on the second entry. + /// Defaults to false. + pub fn wrap_with_directory(&mut self) { + self.wrap_with_directory = true; } } @@ -86,7 +86,7 @@ impl fmt::Display for TreeBuildingFailed { RepeatSlashesInPath(s) => write!(fmt, "path contains repeat slashes: {:?}", s), TooManyRootLevelEntries => write!( fmt, - "multiple root level entries while configured wrap_in_directory = false" + "multiple root level entries while configured wrap_with_directory = false" ), // TODO: perhaps we should allow adding two leafs with same Cid? 
DuplicatePath(s) => write!(fmt, "path exists already: {:?}", s), diff --git a/unixfs/src/dir/builder/buffered.rs b/unixfs/src/dir/builder/buffered.rs index 823af1c70..784130715 100644 --- a/unixfs/src/dir/builder/buffered.rs +++ b/unixfs/src/dir/builder/buffered.rs @@ -108,7 +108,7 @@ impl BufferingTreeBuilder { } // our first level can be full given the options - let full = depth == 0 && !self.opts.wrap_in_directory && dir_builder.is_empty(); + let full = depth == 0 && !self.opts.wrap_with_directory && dir_builder.is_empty(); if last { let mut next_id = Some(*counter); @@ -302,7 +302,8 @@ mod tests { let five_block_foobar = Cid::try_from("QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6").unwrap(); - let opts = TreeOptions::default().with_wrap_in_directory(); + let mut opts = TreeOptions::default(); + opts.wrap_with_directory(); let mut builder = BufferingTreeBuilder::new(opts); builder .put_file("a", five_block_foobar.clone(), 221) @@ -333,7 +334,8 @@ mod tests { let five_block_foobar = Cid::try_from("QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6").unwrap(); - let opts = TreeOptions::default().with_wrap_in_directory(); + let mut opts = TreeOptions::default(); + opts.wrap_with_directory(); let mut builder = BufferingTreeBuilder::new(opts); builder.put_file("a", five_block_foobar, 221).unwrap(); diff --git a/unixfs/src/dir/builder/iter.rs b/unixfs/src/dir/builder/iter.rs index 819b7eecf..a574e11bc 100644 --- a/unixfs/src/dir/builder/iter.rs +++ b/unixfs/src/dir/builder/iter.rs @@ -18,7 +18,7 @@ pub struct PostOrderIterator<'a> { pub(super) cid: Option, pub(super) total_size: u64, // from TreeOptions - pub(super) wrap_in_directory: bool, + pub(super) wrap_with_directory: bool, } impl<'a> PostOrderIterator<'a> { @@ -40,7 +40,7 @@ impl<'a> PostOrderIterator<'a> { reused_children: Vec::new(), cid: None, total_size: 0, - wrap_in_directory: opts.wrap_in_directory, + wrap_with_directory: opts.wrap_with_directory, } } @@ -164,8 +164,8 @@ impl<'a> PostOrderIterator<'a> { // 
FIXME: leaves could be drained and reused collected.extend(leaves); - if !self.wrap_in_directory && parent_id.is_none() { - // we aren't supposed to wrap_in_directory, and we are now looking at the + if !self.wrap_with_directory && parent_id.is_none() { + // we aren't supposed to wrap_with_directory, and we are now looking at the // possibly to be generated root directory. assert_eq!( @@ -193,7 +193,7 @@ impl<'a> PostOrderIterator<'a> { collected.clear(); if let Some(name) = name { - // name is none only for the wrap_in_directory, which cannot really be + // name is none only for the wrap_with_directory, which cannot really be // propagated up but still the parent_id is allowed to be None let previous = self .persisted_cids From dc2c0217b99d5848f419e6e3d55953113452d570 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Aug 2020 14:00:35 +0300 Subject: [PATCH 18/57] feat: /add?progress=true adds the progress reporting, which is a progress notification to the response stream on every block write *in addition to* a notification *after* the file has been read (order with the `Added` message doesn't seem to have been specified). 
--- conformance/test/index.js | 3 -- http/src/v0/root_files/add.rs | 54 +++++++++++++++++++++++++++++------ 2 files changed, 46 insertions(+), 11 deletions(-) diff --git a/conformance/test/index.js b/conformance/test/index.js index 8e55d4ca5..d884bed89 100644 --- a/conformance/test/index.js +++ b/conformance/test/index.js @@ -76,9 +76,6 @@ tests.root.get(factory); tests.root.add(factory, { skip: [ // ordered in the order of most likely implementation - // progress: - "should add a BIG Buffer with progress enabled", - "should add a nested directory as array of tupples with progress", // unixfsv1.5 metadata "should add with mode as string", "should add with mode as number", diff --git a/http/src/v0/root_files/add.rs b/http/src/v0/root_files/add.rs index 5ea8dd48d..ebe98316a 100644 --- a/http/src/v0/root_files/add.rs +++ b/http/src/v0/root_files/add.rs @@ -153,7 +153,10 @@ where }?; let mut adder = FileAdder::default(); - let mut total = 0u64; + // how much of bytes have we stored as blocks + let mut total_written = 0u64; + // how much of bytes have we read of input + let mut total_read = 0u64; loop { let next = field @@ -164,31 +167,66 @@ where match next { Some(next) => { let mut read = 0usize; + let mut saved_any = false; + while read < next.len() { let (iter, used) = adder.push(&next.slice(read..)); read += used; let maybe_tuple = import_all(&ipfs, iter).await.map_err(AddError::Persisting)?; - total += maybe_tuple.map(|t| t.1).unwrap_or(0); + let subtotal = maybe_tuple.map(|t| t.1); + + total_written += subtotal.unwrap_or(0); + + saved_any |= subtotal.is_some(); + } + + total_read += read as u64; + + if saved_any && opts.progress { + // technically we could just send messages but that'd + // need us to let go using Cow's or use Arc or + // similar. not especially fond of either. 
+ serde_json::to_writer((&mut buffer).writer(), &Response::Progress { + name: Cow::Borrowed(&filename), + bytes: total_read, + }).map_err(AddError::ResponseSerialization)?; + + buffer.put(&b"\r\n"[..]); + yield buffer.split().freeze(); } } None => break, } } + if opts.progress { + // in the interface-http-core tests the subtotal is expected to be full + // size, ordering w.r.t. to the "added" is not specified + serde_json::to_writer((&mut buffer).writer(), &Response::Progress { + name: Cow::Borrowed(&filename), + bytes: total_read, + }).map_err(AddError::ResponseSerialization)?; + + buffer.put(&b"\r\n"[..]); + + // it is not required to yield here so perhaps we just accumulate the next + // response in as well + } + let (root, subtotal) = import_all(&ipfs, adder.finish()) .await .map_err(AddError::Persisting)? .expect("I think there should always be something from finish -- except if the link block has just been compressed?"); - total += subtotal; + total_written += subtotal; - tracing::trace!("completed processing file of {} bytes: {:?}", total, filename); + tracing::trace!("completed processing file of {} bytes: {:?}", total_read, filename); // using the filename as the path since we can tolerate a single empty named file // however the second one will cause issues - tree.put_file(&filename, root.clone(), total) + tree.put_file(&filename, root.clone(), total_written) .map_err(AddError::TreeGathering)?; let filename: Cow<'_, str> = if filename.is_empty() { @@ -202,7 +240,7 @@ where serde_json::to_writer((&mut buffer).writer(), &Response::Added { name: filename, hash: Quoted(&root), - size: Quoted(total), + size: Quoted(total_written), }).map_err(AddError::ResponseSerialization)?; buffer.put(&b"\r\n"[..]); @@ -289,12 +327,12 @@ async fn import_all( enum Response<'a> { /// When progress=true query parameter has been given, this will be output every N bytes, or /// perhaps every chunk. 
- #[allow(unused)] // unused == not implemented yet Progress { /// Probably the name of the file being added or empty if none was provided. name: Cow<'a, str>, /// Bytes processed since last progress; for a file, all progress reports must add up to - /// the total file size. + /// the total file size. Interestingly this should not be stringified with `Quoted`, + /// whereas the `Added::size` needs to be `Quoted`. bytes: u64, }, /// Output for every input item. From a90826b48df2687ab307dbfc362dc8854b0f38fa Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Aug 2020 14:27:39 +0300 Subject: [PATCH 19/57] perf: avoid clearing block_buffer this should lead to less zero extending but it's questionable if this is more performant. --- unixfs/src/dir/builder/iter.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/unixfs/src/dir/builder/iter.rs b/unixfs/src/dir/builder/iter.rs index a574e11bc..2813e0202 100644 --- a/unixfs/src/dir/builder/iter.rs +++ b/unixfs/src/dir/builder/iter.rs @@ -29,7 +29,6 @@ impl<'a> PostOrderIterator<'a> { opts: TreeOptions, ) -> Self { full_path.clear(); - block_buffer.clear(); PostOrderIterator { full_path, @@ -79,21 +78,22 @@ impl<'a> PostOrderIterator<'a> { // FIXME: we shouldn't be creating too large structures (bitswap block size limit!) // FIXME: changing this to autosharding is going to take some thinking - buffer.clear(); let cap = buffer.capacity(); if let Some(additional) = size.checked_sub(cap) { buffer.reserve(additional); } - // TODO: this could be done more integelligently; for example, we could just zero extend - // on reserving, then just truncate or somehow carry around the real length of the buffer - // to avoid truncating and zero extending. 
- buffer.extend(std::iter::repeat(0).take(size)); + if let Some(needed_zeroes) = size.checked_sub(buffer.len()) { + buffer.extend(std::iter::repeat(0).take(needed_zeroes)); + } let mut writer = Writer::new(BytesWriter::new(&mut buffer[..])); flat.write_message(&mut writer) .expect("unsure how this could fail"); + + buffer.truncate(size); + let mh = multihash::wrap(multihash::Code::Sha2_256, &Sha256::digest(&buffer)); let cid = Cid::new_v0(mh).expect("sha2_256 is the correct multihash for cidv0"); @@ -177,9 +177,7 @@ impl<'a> PostOrderIterator<'a> { return None; } - // render unixfs, maybe return it? let buffer = &mut self.block_buffer; - buffer.clear(); let leaf = match Self::render_directory(&collected, buffer) { Ok(leaf) => leaf, From 653d079cc9228f92fc5c66320ee8af4f43b21462 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Aug 2020 14:28:36 +0300 Subject: [PATCH 20/57] doc: add note on HAMT sharding, adjust --- unixfs/src/dir/builder/iter.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/unixfs/src/dir/builder/iter.rs b/unixfs/src/dir/builder/iter.rs index 2813e0202..906392a0b 100644 --- a/unixfs/src/dir/builder/iter.rs +++ b/unixfs/src/dir/builder/iter.rs @@ -52,10 +52,25 @@ impl<'a> PostOrderIterator<'a> { use sha2::{Digest, Sha256}; use std::borrow::Cow; - // TODO: this could quite easily be made so that the links are read from the btreemap for - // calculating the size and rendering + // FIXME: ideas on how to turn this into a HAMT sharding on some heuristic. we probably + // need to introduce states in to the "iterator": + // + // 1. bucketization + // 2. another post order visit of the buckets? + // + // the nested post order visit should probably re-use the existing infra ("message + // passing") and new ids can be generated by giving this iterator the counter from + // BufferedTreeWriter. 
+ // + // could also be that the HAMT shard building should start earlier, since the same + // heuristic can be detected *at* bufferedtreewriter. there the split would be easier, and + // this would "just" be a single node rendering, and not need any additional states.. + + // track the combined size which we'll report to our parent let mut combined_from_links = 0; + // TODO: this could quite easily be made so that the links are read from the btreemap for + // calculating the size and rendering let flat = FlatUnixFs { links: links .iter() // .drain() would be the most reasonable From 3e499e161eded93b26e05d543891c4f796e7f129 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Aug 2020 15:06:22 +0300 Subject: [PATCH 21/57] perf: custom dag-pb serialization use the BTreeMap directly to output the PBLink alike bytes without going through the mapping into Vec. Does not support Cid v1 though yet. --- unixfs/src/dir/builder/iter.rs | 95 +++++++++++++++++++++++++++------- 1 file changed, 75 insertions(+), 20 deletions(-) diff --git a/unixfs/src/dir/builder/iter.rs b/unixfs/src/dir/builder/iter.rs index 906392a0b..8b4b72eae 100644 --- a/unixfs/src/dir/builder/iter.rs +++ b/unixfs/src/dir/builder/iter.rs @@ -47,10 +47,9 @@ impl<'a> PostOrderIterator<'a> { links: &BTreeMap, buffer: &mut Vec, ) -> Result { - use crate::pb::{FlatUnixFs, PBLink, UnixFs, UnixFsType}; - use quick_protobuf::{BytesWriter, MessageWrite, Writer}; + use crate::pb::{UnixFs, UnixFsType}; + use quick_protobuf::{BytesWriter, MessageWrite, Writer, WriterBackend}; use sha2::{Digest, Sha256}; - use std::borrow::Cow; // FIXME: ideas on how to turn this into a HAMT sharding on some heuristic. we probably // need to introduce states in to the "iterator": @@ -66,21 +65,71 @@ impl<'a> PostOrderIterator<'a> { // heuristic can be detected *at* bufferedtreewriter. there the split would be easier, and // this would "just" be a single node rendering, and not need any additional states.. 
- // track the combined size which we'll report to our parent - let mut combined_from_links = 0; - - // TODO: this could quite easily be made so that the links are read from the btreemap for - // calculating the size and rendering - let flat = FlatUnixFs { - links: links - .iter() // .drain() would be the most reasonable - .inspect(|(_, Leaf { total_size, .. })| combined_from_links += total_size) - .map(|(name, Leaf { link, total_size })| PBLink { - Hash: Some(link.to_bytes().into()), - Name: Some(Cow::Borrowed(name.as_str())), - Tsize: Some(*total_size), - }) - .collect::>(), + /// Newtype which uses the BTreeMap as Vec. + struct BTreeMappedDir<'a> { + links: &'a BTreeMap, + data: UnixFs<'a>, + } + + /// Newtype which represents an entry from BTreeMap as PBLink as far as the + /// protobuf representation goes. + struct EntryAsPBLink<'a>(&'a String, &'a Leaf); + + impl<'a> MessageWrite for EntryAsPBLink<'a> { + fn get_size(&self) -> usize { + use quick_protobuf::sizeofs::*; + + // ones are the tags + 1 + sizeof_len(self.0.len()) + + 1 + + sizeof_len(self.1.link.hash().as_bytes().len()) + + 1 + + sizeof_varint(self.1.total_size) + } + + fn write_message( + &self, + w: &mut Writer, + ) -> quick_protobuf::Result<()> { + w.write_with_tag(10, |w| w.write_bytes(self.1.link.hash().as_bytes()))?; + w.write_with_tag(18, |w| w.write_string(self.0.as_str()))?; + w.write_with_tag(24, |w| w.write_uint64(self.1.total_size))?; + Ok(()) + } + } + + impl<'a> MessageWrite for BTreeMappedDir<'a> { + fn get_size(&self) -> usize { + use quick_protobuf::sizeofs::*; + + let links = self + .links + .iter() + .inspect(|(_, Leaf { link, .. 
})| { + assert!( + link.version() == cid::Version::V0, + "size calc is only impl for v0 cids" + ) + }) + .map(|(k, v)| EntryAsPBLink(k, v)) + .map(|link| 1 + sizeof_len(link.get_size())) + .sum::(); + + links + 1 + sizeof_len(self.data.get_size()) + } + fn write_message( + &self, + w: &mut Writer, + ) -> quick_protobuf::Result<()> { + for l in self.links.iter().map(|(k, v)| EntryAsPBLink(k, v)) { + w.write_with_tag(18, |w| w.write_message(&l))?; + } + w.write_with_tag(10, |w| w.write_message(&self.data)) + } + } + + let btreed = BTreeMappedDir { + links, data: UnixFs { Type: UnixFsType::Directory, Data: None, @@ -88,7 +137,7 @@ impl<'a> PostOrderIterator<'a> { }, }; - let size = flat.get_size(); + let size = btreed.get_size(); // FIXME: we shouldn't be creating too large structures (bitswap block size limit!) // FIXME: changing this to autosharding is going to take some thinking @@ -104,7 +153,8 @@ impl<'a> PostOrderIterator<'a> { } let mut writer = Writer::new(BytesWriter::new(&mut buffer[..])); - flat.write_message(&mut writer) + btreed + .write_message(&mut writer) .expect("unsure how this could fail"); buffer.truncate(size); @@ -112,6 +162,11 @@ impl<'a> PostOrderIterator<'a> { let mh = multihash::wrap(multihash::Code::Sha2_256, &Sha256::digest(&buffer)); let cid = Cid::new_v0(mh).expect("sha2_256 is the correct multihash for cidv0"); + let combined_from_links = links + .values() + .map(|Leaf { total_size, .. }| total_size) + .sum::(); + Ok(Leaf { link: cid, total_size: buffer.len() as u64 + combined_from_links, From 9e948f943fcbafbd924fa973162a6a447677dcda Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Aug 2020 15:46:50 +0300 Subject: [PATCH 22/57] perf: render Cids to bytes without allocating a Vec not sure how much sense does this make, given that the byte writing fns might be less inlined. would be nice to have some benchmarks. 
this also adds a test to verify directory cid when using cidv1 even though it's quite unclear to me if those are allowed in unixfs dirs. --- unixfs/src/dir/builder/buffered.rs | 38 +++++++++++++++++++ unixfs/src/dir/builder/iter.rs | 59 ++++++++++++++++++++++++++---- 2 files changed, 89 insertions(+), 8 deletions(-) diff --git a/unixfs/src/dir/builder/buffered.rs b/unixfs/src/dir/builder/buffered.rs index 784130715..c48e22f48 100644 --- a/unixfs/src/dir/builder/buffered.rs +++ b/unixfs/src/dir/builder/buffered.rs @@ -417,6 +417,44 @@ mod tests { ); } + #[test] + fn dir_with_cidv1_link() { + // this is `echo '{ "name": "hello" }` | ./ipfs dag put` + let target = + Cid::try_from("bafyreihakpd7te5nbmlhdk5ntvcvhf2hmfgrvcwna2sddq5zz5342mcbli").unwrap(); + + let mut builder = BufferingTreeBuilder::default(); + builder.put_file("a/b", target, 12).unwrap(); + + let mut full_path = String::new(); + let mut buffer = Vec::new(); + + let iter = builder.build(&mut full_path, &mut buffer); + let mut actual = iter + .map(|res| res.map(|n| (n.path, n.cid, n.block))) + .collect::, _>>() + .unwrap(); + + let mut expected = vec![("a", "QmPMDMPG8dbHDC9GuvqWr9pfruLnp4GZCAWrskwCmenVQa")]; + + // hopefully this way the errors will be easier to hunt down + + actual.reverse(); + expected.reverse(); + + while let Some(actual) = actual.pop() { + let expected = expected.pop().expect("size mismatch"); + assert_eq!(actual.0, expected.0); + assert_eq!( + actual.1.to_string(), + expected.1, + "{:?}: {:?}", + actual.0, + Hex(&actual.2) + ); + } + } + /// Returns a quick and dirty sha2-256 of the given number as a Cidv0 fn some_cid(number: usize) -> Cid { use multihash::Sha2_256; diff --git a/unixfs/src/dir/builder/iter.rs b/unixfs/src/dir/builder/iter.rs index 8b4b72eae..e682400f6 100644 --- a/unixfs/src/dir/builder/iter.rs +++ b/unixfs/src/dir/builder/iter.rs @@ -82,7 +82,8 @@ impl<'a> PostOrderIterator<'a> { // ones are the tags 1 + sizeof_len(self.0.len()) + 1 - + 
sizeof_len(self.1.link.hash().as_bytes().len()) + //+ sizeof_len(WriteableCid(&self.1.link).get_size()) + + sizeof_len(self.1.link.to_bytes().len()) + 1 + sizeof_varint(self.1.total_size) } @@ -91,13 +92,61 @@ impl<'a> PostOrderIterator<'a> { &self, w: &mut Writer, ) -> quick_protobuf::Result<()> { - w.write_with_tag(10, |w| w.write_bytes(self.1.link.hash().as_bytes()))?; + // w.write_with_tag(10, |w| w.write_message(&WriteableCid(&self.1.link)))?; + w.write_with_tag(10, |w| w.write_bytes(&self.1.link.to_bytes()))?; w.write_with_tag(18, |w| w.write_string(self.0.as_str()))?; w.write_with_tag(24, |w| w.write_uint64(self.1.total_size))?; Ok(()) } } + struct WriteableCid<'a>(&'a Cid); + + // Cid by default does not have a way to count it's length or just write it out without + // allocating a vector. + impl<'a> MessageWrite for WriteableCid<'a> { + fn get_size(&self) -> usize { + use cid::Version::*; + use quick_protobuf::sizeofs::*; + + match self.0.version() { + V0 => self.0.hash().as_bytes().len(), + V1 => { + let version_len = 1; + let codec_len = sizeof_varint(u64::from(self.0.codec())); + let hash_len = self.0.hash().as_bytes().len(); + version_len + codec_len + hash_len + } + } + } + + fn write_message( + &self, + w: &mut Writer, + ) -> quick_protobuf::Result<()> { + use cid::Version::*; + + match self.0.version() { + V0 => { + for b in self.0.hash().as_bytes() { + w.write_u8(*b)?; + } + Ok(()) + } + V1 => { + // it is possible that Cidv1 should not be linked to from a unixfs + // directory; at least go-ipfs 0.5 `ipfs files` denies making a cbor link + w.write_u8(1)?; + w.write_varint(u64::from(self.0.codec()))?; + for b in self.0.hash().as_bytes() { + w.write_u8(*b)?; + } + Ok(()) + } + } + } + } + impl<'a> MessageWrite for BTreeMappedDir<'a> { fn get_size(&self) -> usize { use quick_protobuf::sizeofs::*; @@ -105,12 +154,6 @@ impl<'a> PostOrderIterator<'a> { let links = self .links .iter() - .inspect(|(_, Leaf { link, .. 
})| { - assert!( - link.version() == cid::Version::V0, - "size calc is only impl for v0 cids" - ) - }) .map(|(k, v)| EntryAsPBLink(k, v)) .map(|link| 1 + sizeof_len(link.get_size())) .sum::(); From f3b92d48d663255a59dfd353bc544425c111a214 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Aug 2020 17:32:25 +0300 Subject: [PATCH 23/57] fix: replace todo! with Err(PathEndsInSlash) this can leave empty nodes in the tree. --- unixfs/src/dir/builder.rs | 3 +++ unixfs/src/dir/builder/buffered.rs | 31 ++++++++++++++++++++++-------- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/unixfs/src/dir/builder.rs b/unixfs/src/dir/builder.rs index 64ac93f81..1903dfd8e 100644 --- a/unixfs/src/dir/builder.rs +++ b/unixfs/src/dir/builder.rs @@ -68,6 +68,8 @@ pub enum TreeBuildingFailed { RootedPath(String), /// The given full path contained empty segment. RepeatSlashesInPath(String), + /// The given full path ends in slash. + PathEndsInSlash(String), /// If the `BufferingTreeBuilder` was created without `TreeOptions` with the option `wrap in /// directory` enabled, then there can be only a single element at the root. 
TooManyRootLevelEntries, @@ -84,6 +86,7 @@ impl fmt::Display for TreeBuildingFailed { match self { RootedPath(s) => write!(fmt, "path is rooted: {:?}", s), RepeatSlashesInPath(s) => write!(fmt, "path contains repeat slashes: {:?}", s), + PathEndsInSlash(s) => write!(fmt, "path ends in a slash: {:?}", s), TooManyRootLevelEntries => write!( fmt, "multiple root level entries while configured wrap_with_directory = false" diff --git a/unixfs/src/dir/builder/buffered.rs b/unixfs/src/dir/builder/buffered.rs index c48e22f48..f6b641e72 100644 --- a/unixfs/src/dir/builder/buffered.rs +++ b/unixfs/src/dir/builder/buffered.rs @@ -86,6 +86,18 @@ impl BufferingTreeBuilder { let mut remaining = full_path.split('/').enumerate().peekable(); let mut dir_builder = &mut self.root_builder; + // check these before to avoid creation of bogus nodes in the tree or having to clean up. + + if full_path.ends_with('/') { + return Err(TreeBuildingFailed::PathEndsInSlash(full_path.to_string())); + } + + if full_path.contains("//") { + return Err(TreeBuildingFailed::RepeatSlashesInPath( + full_path.to_string(), + )); + } + // needed to avoid borrowing into the DirBuilder::new calling closure let counter = &mut self.counter; @@ -94,16 +106,19 @@ impl BufferingTreeBuilder { match (depth, next, last) { // this might need to be accepted in case there is just a single file - (0, "", true) => { /* accepted */ } - (0, "", false) => { - return Err(TreeBuildingFailed::RootedPath(full_path.to_string())) + (0, "", true) => { + // accepted: allows unconditional tree building in ipfs-http + // but the resulting tree will have at most single node, which doesn't prompt + // creation of new directories and should be fine. 
} - (_, "", false) => { - return Err(TreeBuildingFailed::RepeatSlashesInPath( - full_path.to_string(), - )) + (0, "", false) => { + // ok to keep this inside the loop; we are yet to create any + // note the ipfs-http (and for example js-ipfs) normalizes the path by + // removing the slash from the start. + return Err(TreeBuildingFailed::RootedPath(full_path.to_string())); } - (_, "", true) => todo!("path ends in slash"), + (_, "", false) => unreachable!("already validated: no repeat slashes"), + (_, "", true) => unreachable!("already validated: path does not end in slash"), _ => {} } From 1c71e3c08d7f10d4daf6a2be309e5ed3f74298e8 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Aug 2020 17:33:18 +0300 Subject: [PATCH 24/57] refactor: tests repeated the verify with block dump fn --- unixfs/src/dir/builder/buffered.rs | 63 ++++++++++++++---------------- 1 file changed, 29 insertions(+), 34 deletions(-) diff --git a/unixfs/src/dir/builder/buffered.rs b/unixfs/src/dir/builder/buffered.rs index f6b641e72..f04652bec 100644 --- a/unixfs/src/dir/builder/buffered.rs +++ b/unixfs/src/dir/builder/buffered.rs @@ -227,12 +227,12 @@ mod tests { let mut buffer = Vec::new(); let iter = builder.build(&mut full_path, &mut buffer); - let mut actual = iter + let actual = iter .map(|res| res.map(|n| (n.path, n.cid, n.block))) .collect::, _>>() .unwrap(); - let mut expected = vec![ + let expected = vec![ ( "a/b/c/d/e/f", "Qmbgf44ztW9wLcGNRNYGinGQB6SQDQtbHVbkM5MrWms698", @@ -247,34 +247,7 @@ mod tests { ("a", "QmSTUFaPwJW8xD4KNRLLQRqVTYtYC29xuhYTJoYPWdzvKp"), ]; - // hopefully this way the errors will be easier to hunt down - - actual.reverse(); - expected.reverse(); - - while let Some(actual) = actual.pop() { - let expected = expected.pop().expect("size mismatch"); - assert_eq!(actual.0, expected.0); - assert_eq!( - actual.1.to_string(), - expected.1, - "{:?}: {:?}", - actual.0, - Hex(&actual.2) - ); - } - } - - struct Hex<'a>(&'a [u8]); - use std::fmt; - - impl<'a> fmt::Debug 
for Hex<'a> { - fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { - for b in self.0 { - write!(fmt, "{:02x}", b)?; - } - Ok(()) - } + verify_results(expected, actual); } #[test] @@ -445,12 +418,32 @@ mod tests { let mut buffer = Vec::new(); let iter = builder.build(&mut full_path, &mut buffer); - let mut actual = iter + let actual = iter .map(|res| res.map(|n| (n.path, n.cid, n.block))) .collect::, _>>() .unwrap(); - let mut expected = vec![("a", "QmPMDMPG8dbHDC9GuvqWr9pfruLnp4GZCAWrskwCmenVQa")]; + let expected = vec![("a", "QmPMDMPG8dbHDC9GuvqWr9pfruLnp4GZCAWrskwCmenVQa")]; + + verify_results(expected, actual); + } + + fn verify_results( + mut expected: Vec<(impl AsRef, impl AsRef)>, + mut actual: Vec<(String, Cid, Box<[u8]>)>, + ) { + use std::fmt; + + struct Hex<'a>(&'a [u8]); + + impl<'a> fmt::Debug for Hex<'a> { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + for b in self.0 { + write!(fmt, "{:02x}", b)?; + } + Ok(()) + } + } // hopefully this way the errors will be easier to hunt down @@ -459,15 +452,17 @@ mod tests { while let Some(actual) = actual.pop() { let expected = expected.pop().expect("size mismatch"); - assert_eq!(actual.0, expected.0); + assert_eq!(actual.0, expected.0.as_ref()); assert_eq!( actual.1.to_string(), - expected.1, + expected.1.as_ref(), "{:?}: {:?}", actual.0, Hex(&actual.2) ); } + + assert_eq!(expected.len(), 0); } /// Returns a quick and dirty sha2-256 of the given number as a Cidv0 From 39583a77f1da06e7afdceb042bb70f3ca2e5366f Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Aug 2020 17:44:32 +0300 Subject: [PATCH 25/57] refactor: remove extra initialization --- unixfs/src/dir/builder/iter.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/unixfs/src/dir/builder/iter.rs b/unixfs/src/dir/builder/iter.rs index e682400f6..2f092fca0 100644 --- a/unixfs/src/dir/builder/iter.rs +++ b/unixfs/src/dir/builder/iter.rs @@ -175,7 +175,6 @@ impl<'a> PostOrderIterator<'a> { links, data: UnixFs { Type: 
UnixFsType::Directory, - Data: None, ..Default::default() }, }; From 7bbff65bddf7f0ca1c56d14d109ae71e38892fb1 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Aug 2020 17:47:43 +0300 Subject: [PATCH 26/57] fix: DirBuilder::is_empty --- unixfs/src/dir/builder/buffered.rs | 2 +- unixfs/src/dir/builder/dir_builder.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/unixfs/src/dir/builder/buffered.rs b/unixfs/src/dir/builder/buffered.rs index f04652bec..0185da59a 100644 --- a/unixfs/src/dir/builder/buffered.rs +++ b/unixfs/src/dir/builder/buffered.rs @@ -123,7 +123,7 @@ impl BufferingTreeBuilder { } // our first level can be full given the options - let full = depth == 0 && !self.opts.wrap_with_directory && dir_builder.is_empty(); + let full = depth == 0 && !self.opts.wrap_with_directory && !dir_builder.is_empty(); if last { let mut next_id = Some(*counter); diff --git a/unixfs/src/dir/builder/dir_builder.rs b/unixfs/src/dir/builder/dir_builder.rs index 21c28a316..94a341e53 100644 --- a/unixfs/src/dir/builder/dir_builder.rs +++ b/unixfs/src/dir/builder/dir_builder.rs @@ -69,7 +69,7 @@ impl DirBuilder { } pub fn is_empty(&self) -> bool { - self.len() != 0 + self.len() == 0 } pub fn set_metadata(&mut self, metadata: Metadata) { From 7d7915a9dacdb44e477618953ab4882e524a5dfc Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Aug 2020 18:01:48 +0300 Subject: [PATCH 27/57] doc: remove mention of wrap in => wrap with --- unixfs/src/dir/builder.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unixfs/src/dir/builder.rs b/unixfs/src/dir/builder.rs index 1903dfd8e..ace2ec8d3 100644 --- a/unixfs/src/dir/builder.rs +++ b/unixfs/src/dir/builder.rs @@ -70,8 +70,8 @@ pub enum TreeBuildingFailed { RepeatSlashesInPath(String), /// The given full path ends in slash. 
PathEndsInSlash(String), - /// If the `BufferingTreeBuilder` was created without `TreeOptions` with the option `wrap in - /// directory` enabled, then there can be only a single element at the root. + /// If the `BufferingTreeBuilder` was created without `TreeOptions` with the option + /// `wrap_with_directory` enabled, then there can be only a single element at the root. TooManyRootLevelEntries, /// The given full path had already been added. DuplicatePath(String), From 5d8b1243675f6d7d4da83f29e5dbb87da413c3a7 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Aug 2020 18:10:04 +0300 Subject: [PATCH 28/57] doc: suggestions from code review Thanks to ljedrz for the many many fixes. Co-authored-by: ljedrz --- http/src/v0/root_files.rs | 4 ++-- http/src/v0/root_files/add.rs | 20 ++++++++++---------- unixfs/src/dir/builder.rs | 4 ++-- unixfs/src/dir/builder/buffered.rs | 16 ++++++++-------- unixfs/src/dir/builder/iter.rs | 14 +++++++------- 5 files changed, 29 insertions(+), 29 deletions(-) diff --git a/http/src/v0/root_files.rs b/http/src/v0/root_files.rs index ac85c4233..7a9b5e4ab 100644 --- a/http/src/v0/root_files.rs +++ b/http/src/v0/root_files.rs @@ -23,10 +23,10 @@ mod add; #[derive(Debug, Deserialize)] pub struct AddArgs { - // probably never interesting + // unknown meaning; ignoring it doesn't fail any tests #[serde(default, rename = "stream-channels")] stream_channels: bool, - // unsure what this does + // progress reports totaling to the input file size #[serde(default)] progress: bool, /// When true, a new directory is created to hold more than 1 root level directories. 
diff --git a/http/src/v0/root_files/add.rs b/http/src/v0/root_files/add.rs index ebe98316a..ece9dbde1 100644 --- a/http/src/v0/root_files/add.rs +++ b/http/src/v0/root_files/add.rs @@ -35,7 +35,7 @@ pub(super) async fn add_inner( let st = add_stream(ipfs, st, opts); - // map the errors into json objects at least as we cannot return them as trailers (yet) + // map the errors into json objects; as we can't return them as trailers yet let st = st.map(|res| match res { passthrough @ Ok(_) | passthrough @ Err(AddError::ResponseSerialization(_)) => { @@ -46,7 +46,7 @@ pub(super) async fn add_inner( } Err(something_else) => { let msg = crate::v0::support::MessageResponseBuilder::default() - .with_message(format!("{}", something_else)); + .with_message(something_else.to_string()); let bytes: Bytes = serde_json::to_vec(&msg) .expect("serializing here should not have failed") .into(); @@ -134,7 +134,7 @@ where // test cases post with paths '/some-directory/...' and others post with // 'some-directory/...'. 
- // since slash is single code point we can just + // since slash is a single code point, we can just do filename[1..].to_owned() } else { filename.into_owned() @@ -145,7 +145,7 @@ where let next = match content_type { "application/octet-stream" => { - // files are file{,-1,-2,-3,..} + // files are of the form "file-{1,2,3,..}" let _ = if field_name != "file" && !field_name.starts_with("file-") { Err(AddError::UnsupportedField(field_name.to_string())) } else { @@ -153,9 +153,9 @@ where }?; let mut adder = FileAdder::default(); - // how much of bytes have we stored as blocks + // how many bytes we have stored as blocks let mut total_written = 0u64; - // how much of bytes have we read of input + // how many bytes of input we have read let mut total_read = 0u64; loop { @@ -186,7 +186,7 @@ where if saved_any && opts.progress { // technically we could just send messages but that'd - // need us to let go using Cow's or use Arc or + // require us to stop using Cow's and use Arc or // similar. not especially fond of either. serde_json::to_writer((&mut buffer).writer(), &Response::Progress { name: Cow::Borrowed(&filename), @@ -248,7 +248,7 @@ where Ok(buffer.split().freeze()) }, "application/x-directory" => { - // dirs are dir{,-1,-2,-3,..} + // dirs are of the form "dir-{1,2,3,..}" let _ = if field_name != "dir" && !field_name.starts_with("dir-") { Err(AddError::UnsupportedField(field_name.to_string())) } else { @@ -259,9 +259,9 @@ where // except for the already parsed *but* ignored headers while field.try_next().await.map_err(AddError::Parsing)?.is_some() {} - // while we don't at the moment parse the mtime, mtime-nsec headers and mode + // while at the moment we don't parse the mtime, mtime-nsec headers and mode // those should be reflected in the metadata. this will still add an empty - // directory which is good thing. + // directory which is a good thing. 
tree.set_metadata(&filename, ipfs::unixfs::ll::Metadata::default()) .map_err(AddError::TreeGathering)?; continue; diff --git a/unixfs/src/dir/builder.rs b/unixfs/src/dir/builder.rs index ace2ec8d3..76f91a43f 100644 --- a/unixfs/src/dir/builder.rs +++ b/unixfs/src/dir/builder.rs @@ -66,7 +66,7 @@ impl TreeOptions { pub enum TreeBuildingFailed { /// The given full path started with a slash; paths in the `/add` convention are not rooted. RootedPath(String), - /// The given full path contained empty segment. + /// The given full path contained an empty segment. RepeatSlashesInPath(String), /// The given full path ends in slash. PathEndsInSlash(String), @@ -91,7 +91,7 @@ impl fmt::Display for TreeBuildingFailed { fmt, "multiple root level entries while configured wrap_with_directory = false" ), - // TODO: perhaps we should allow adding two leafs with same Cid? + // TODO: perhaps we should allow adding two leafs with the same Cid? DuplicatePath(s) => write!(fmt, "path exists already: {:?}", s), LeafAsDirectory(s) => write!( fmt, diff --git a/unixfs/src/dir/builder/buffered.rs b/unixfs/src/dir/builder/buffered.rs index 0185da59a..3ea13f992 100644 --- a/unixfs/src/dir/builder/buffered.rs +++ b/unixfs/src/dir/builder/buffered.rs @@ -3,15 +3,15 @@ use crate::Metadata; use cid::Cid; use std::collections::hash_map::Entry::*; -/// UnixFs directory tree builder, which buffers entries until `build()` is called. +/// UnixFs directory tree builder which buffers entries until `build()` is called. #[derive(Debug)] pub struct BufferingTreeBuilder { /// At the root there can be only one element, unless an option was given to create a new /// directory surrounding the root elements. 
root_builder: DirBuilder, longest_path: usize, - // used to generate each node an unique id which is used when doing the post order traversal to - // recover all childrens rendered Cids + // used to generate a unique id for each node; it is used when doing the post order traversal to + // recover all children's rendered Cids counter: u64, opts: TreeOptions, } @@ -33,7 +33,7 @@ impl BufferingTreeBuilder { } } - /// Records the give path to be a link to the following cid. + /// Registers the given path to be a link to the cid that follows. /// /// FIXME: this should be renamed as "put_leaf" or "put_opaque_leaf". pub fn put_file( @@ -80,7 +80,7 @@ impl BufferingTreeBuilder { { // create all paths along the way // - // assuming it's ok to split '/' since that cannot be escaped in linux at least + // assuming it's ok to split at '/' since that cannot be escaped in linux at least self.longest_path = full_path.len().max(self.longest_path); let mut remaining = full_path.split('/').enumerate().peekable(); @@ -122,7 +122,7 @@ impl BufferingTreeBuilder { _ => {} } - // our first level can be full given the options + // our first level can be full, depending on the options given let full = depth == 0 && !self.opts.wrap_with_directory && !dir_builder.is_empty(); if last { @@ -172,12 +172,12 @@ impl BufferingTreeBuilder { } /// Called to build the tree. The built tree will have the added files and their implied - /// directory structure, along with the any directory entries which were created using + /// directory structure, along with the directory entries which were created using /// `set_metadata`. To build the whole hierarchy, one must iterate the returned iterator to /// completion while storing the created blocks. /// /// Returned `PostOrderIterator` will use the given `full_path` and `block_buffer` to store - /// it's data during the walk. `PostOrderIterator` implements `Iterator` while also allowing + /// its data during the walk. 
`PostOrderIterator` implements `Iterator` while also allowing /// borrowed access via `next_borrowed`. pub fn build<'a>( self, diff --git a/unixfs/src/dir/builder/iter.rs b/unixfs/src/dir/builder/iter.rs index 2f092fca0..00fcb9873 100644 --- a/unixfs/src/dir/builder/iter.rs +++ b/unixfs/src/dir/builder/iter.rs @@ -4,8 +4,8 @@ use std::collections::{BTreeMap, HashMap}; /// Constructs the directory nodes required for a tree. /// -/// Implements the Iterator interface for owned values and the borrowed version `next_borrowed`. -/// Tree is fully constructed once this has been exhausted. +/// Implements the Iterator interface for owned values and the borrowed version, `next_borrowed`. +/// The tree is fully constructed once this has been exhausted. pub struct PostOrderIterator<'a> { pub(super) full_path: &'a mut String, pub(super) old_depth: usize, @@ -102,7 +102,7 @@ impl<'a> PostOrderIterator<'a> { struct WriteableCid<'a>(&'a Cid); - // Cid by default does not have a way to count it's length or just write it out without + // Cid by default does not have a way to count its length or just write it out without // allocating a vector. impl<'a> MessageWrite for WriteableCid<'a> { fn get_size(&self) -> usize { @@ -278,7 +278,7 @@ impl<'a> PostOrderIterator<'a> { if !self.wrap_with_directory && parent_id.is_none() { // we aren't supposed to wrap_with_directory, and we are now looking at the - // possibly to be generated root directory. + // possibly to-be-generated root directory. 
assert_eq!( collected.len(), @@ -303,7 +303,7 @@ impl<'a> PostOrderIterator<'a> { collected.clear(); if let Some(name) = name { - // name is none only for the wrap_with_directory, which cannot really be + // name is None only for wrap_with_directory, which cannot really be // propagated up but still the parent_id is allowed to be None let previous = self .persisted_cids @@ -378,14 +378,14 @@ fn update_full_path( depth: usize, ) { if depth < 2 { - // initially thought it might be good idea to add slash to all components; removing it made + // initially thought it might be a good idea to add a slash to all components; removing it made // it impossible to get back down to empty string, so fixing this for depths 0 and 1. full_path.clear(); *old_depth = 0; } else { while *old_depth >= depth && *old_depth > 0 { // we now want to pop the last segment - // this would be easier with pathbuf + // this would be easier with PathBuf let slash_at = full_path.bytes().rposition(|ch| ch == b'/'); if let Some(slash_at) = slash_at { full_path.truncate(slash_at); From b134593708229fafa1d4251c3ca61abe958fe54c Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Aug 2020 18:16:19 +0300 Subject: [PATCH 29/57] doc: minor too short sentence --- unixfs/src/dir/builder/buffered.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unixfs/src/dir/builder/buffered.rs b/unixfs/src/dir/builder/buffered.rs index 3ea13f992..a9d2d5c50 100644 --- a/unixfs/src/dir/builder/buffered.rs +++ b/unixfs/src/dir/builder/buffered.rs @@ -112,7 +112,7 @@ impl BufferingTreeBuilder { // creation of new directories and should be fine. } (0, "", false) => { - // ok to keep this inside the loop; we are yet to create any + // ok to keep this inside the loop; we are yet to create any nodes. // note the ipfs-http (and for example js-ipfs) normalizes the path by // removing the slash from the start. 
return Err(TreeBuildingFailed::RootedPath(full_path.to_string())); From 761124884fbd0b6bf09975cc9e1fc54c614b91ff Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Aug 2020 18:34:25 +0300 Subject: [PATCH 30/57] feat: block size limit --- unixfs/src/dir/builder.rs | 34 ++++++++++++++++++++++++++++++---- unixfs/src/dir/builder/iter.rs | 23 ++++++++++++++++++----- 2 files changed, 48 insertions(+), 9 deletions(-) diff --git a/unixfs/src/dir/builder.rs b/unixfs/src/dir/builder.rs index 76f91a43f..240ae0493 100644 --- a/unixfs/src/dir/builder.rs +++ b/unixfs/src/dir/builder.rs @@ -48,12 +48,29 @@ impl fmt::Debug for Leaf { } /// Configuration for customizing how the tree is built. -#[derive(Default, Debug, Clone)] +#[derive(Debug, Clone)] pub struct TreeOptions { + block_size_limit: Option, wrap_with_directory: bool, } +impl Default for TreeOptions { + fn default() -> Self { + TreeOptions { + // this is just a guess; our bitswap message limit is a bit more + block_size_limit: Some(512 * 1024), + wrap_with_directory: false, + } + } +} + impl TreeOptions { + /// Overrides the default directory block size limit. If the size limit is set to `None`, no + /// directory will be too large. + pub fn block_size_limit(&mut self, limit: Option) { + self.block_size_limit = limit; + } + /// When true, allow multiple top level entries, otherwise error on the second entry. /// Defaults to false. pub fn wrap_with_directory(&mut self) { @@ -123,12 +140,21 @@ enum Visited { /// Failure cases for `PostOrderIterator` creating the tree dag-pb nodes. #[derive(Debug)] pub enum TreeConstructionFailed { - // TODO: at least any quick_protobuf errors here? + /// Failed to serialize the protobuf node for the directory + Protobuf(quick_protobuf::Error), + /// The resulting directory would be too large and HAMT sharding is yet to be implemented or + /// denied. 
+ TooLargeBlock(u64), } impl fmt::Display for TreeConstructionFailed { - fn fmt(&self, _fmt: &mut fmt::Formatter<'_>) -> fmt::Result { - todo!() + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + use TreeConstructionFailed::*; + + match self { + Protobuf(e) => write!(fmt, "serialization failed: {}", e), + TooLargeBlock(size) => write!(fmt, "attempted to create block of {} bytes", size), + } } } diff --git a/unixfs/src/dir/builder/iter.rs b/unixfs/src/dir/builder/iter.rs index 00fcb9873..1fc759704 100644 --- a/unixfs/src/dir/builder/iter.rs +++ b/unixfs/src/dir/builder/iter.rs @@ -18,7 +18,7 @@ pub struct PostOrderIterator<'a> { pub(super) cid: Option, pub(super) total_size: u64, // from TreeOptions - pub(super) wrap_with_directory: bool, + pub(super) opts: TreeOptions, } impl<'a> PostOrderIterator<'a> { @@ -39,13 +39,14 @@ impl<'a> PostOrderIterator<'a> { reused_children: Vec::new(), cid: None, total_size: 0, - wrap_with_directory: opts.wrap_with_directory, + opts, } } fn render_directory( links: &BTreeMap, buffer: &mut Vec, + block_size_limit: &Option, ) -> Result { use crate::pb::{UnixFs, UnixFsType}; use quick_protobuf::{BytesWriter, MessageWrite, Writer, WriterBackend}; @@ -181,6 +182,14 @@ impl<'a> PostOrderIterator<'a> { let size = btreed.get_size(); + if let Some(limit) = block_size_limit { + let size = size as u64; + if *limit < size { + // FIXME: this could probably be detected at + return Err(TreeConstructionFailed::TooLargeBlock(size)); + } + } + // FIXME: we shouldn't be creating too large structures (bitswap block size limit!) 
// FIXME: changing this to autosharding is going to take some thinking @@ -197,7 +206,7 @@ impl<'a> PostOrderIterator<'a> { let mut writer = Writer::new(BytesWriter::new(&mut buffer[..])); btreed .write_message(&mut writer) - .expect("unsure how this could fail"); + .map_err(TreeConstructionFailed::Protobuf)?; buffer.truncate(size); @@ -276,7 +285,7 @@ impl<'a> PostOrderIterator<'a> { // FIXME: leaves could be drained and reused collected.extend(leaves); - if !self.wrap_with_directory && parent_id.is_none() { + if !self.opts.wrap_with_directory && parent_id.is_none() { // we aren't supposed to wrap_with_directory, and we are now looking at the // possibly to-be-generated root directory. @@ -291,7 +300,11 @@ impl<'a> PostOrderIterator<'a> { let buffer = &mut self.block_buffer; - let leaf = match Self::render_directory(&collected, buffer) { + let leaf = match Self::render_directory( + &collected, + buffer, + &self.opts.block_size_limit, + ) { Ok(leaf) => leaf, Err(e) => return Some(Err(e)), }; From f24f9bda420bbc97860bdab3f075e39b7d3b6023 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Aug 2020 18:34:45 +0300 Subject: [PATCH 31/57] refactor: reorder items, fix doc --- unixfs/src/dir/builder/iter.rs | 75 +++++++++++++++++----------------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/unixfs/src/dir/builder/iter.rs b/unixfs/src/dir/builder/iter.rs index 1fc759704..22b28eb46 100644 --- a/unixfs/src/dir/builder/iter.rs +++ b/unixfs/src/dir/builder/iter.rs @@ -60,51 +60,15 @@ impl<'a> PostOrderIterator<'a> { // // the nested post order visit should probably re-use the existing infra ("message // passing") and new ids can be generated by giving this iterator the counter from - // BufferedTreeWriter. + // BufferedTreeBuilder. // // could also be that the HAMT shard building should start earlier, since the same // heuristic can be detected *at* bufferedtreewriter. 
there the split would be easier, and // this would "just" be a single node rendering, and not need any additional states.. - /// Newtype which uses the BTreeMap as Vec. - struct BTreeMappedDir<'a> { - links: &'a BTreeMap, - data: UnixFs<'a>, - } - - /// Newtype which represents an entry from BTreeMap as PBLink as far as the - /// protobuf representation goes. - struct EntryAsPBLink<'a>(&'a String, &'a Leaf); - - impl<'a> MessageWrite for EntryAsPBLink<'a> { - fn get_size(&self) -> usize { - use quick_protobuf::sizeofs::*; - - // ones are the tags - 1 + sizeof_len(self.0.len()) - + 1 - //+ sizeof_len(WriteableCid(&self.1.link).get_size()) - + sizeof_len(self.1.link.to_bytes().len()) - + 1 - + sizeof_varint(self.1.total_size) - } - - fn write_message( - &self, - w: &mut Writer, - ) -> quick_protobuf::Result<()> { - // w.write_with_tag(10, |w| w.write_message(&WriteableCid(&self.1.link)))?; - w.write_with_tag(10, |w| w.write_bytes(&self.1.link.to_bytes()))?; - w.write_with_tag(18, |w| w.write_string(self.0.as_str()))?; - w.write_with_tag(24, |w| w.write_uint64(self.1.total_size))?; - Ok(()) - } - } - + /// Newtype around Cid to allow embedding it as PBLink::Hash without allocating a vector. struct WriteableCid<'a>(&'a Cid); - // Cid by default does not have a way to count its length or just write it out without - // allocating a vector. impl<'a> MessageWrite for WriteableCid<'a> { fn get_size(&self) -> usize { use cid::Version::*; @@ -148,6 +112,41 @@ impl<'a> PostOrderIterator<'a> { } } + /// Newtype which uses the BTreeMap as Vec. + struct BTreeMappedDir<'a> { + links: &'a BTreeMap, + data: UnixFs<'a>, + } + + /// Newtype which represents an entry from BTreeMap as PBLink as far as the + /// protobuf representation goes. 
+ struct EntryAsPBLink<'a>(&'a String, &'a Leaf); + + impl<'a> MessageWrite for EntryAsPBLink<'a> { + fn get_size(&self) -> usize { + use quick_protobuf::sizeofs::*; + + // ones are the tags + 1 + sizeof_len(self.0.len()) + + 1 + //+ sizeof_len(WriteableCid(&self.1.link).get_size()) + + sizeof_len(self.1.link.to_bytes().len()) + + 1 + + sizeof_varint(self.1.total_size) + } + + fn write_message( + &self, + w: &mut Writer, + ) -> quick_protobuf::Result<()> { + // w.write_with_tag(10, |w| w.write_message(&WriteableCid(&self.1.link)))?; + w.write_with_tag(10, |w| w.write_bytes(&self.1.link.to_bytes()))?; + w.write_with_tag(18, |w| w.write_string(self.0.as_str()))?; + w.write_with_tag(24, |w| w.write_uint64(self.1.total_size))?; + Ok(()) + } + } + impl<'a> MessageWrite for BTreeMappedDir<'a> { fn get_size(&self) -> usize { use quick_protobuf::sizeofs::*; From 684ecdc9d3bf0c0a647251cdf97e9e4dc40e2733 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Aug 2020 21:47:59 +0300 Subject: [PATCH 32/57] doc: note lld-9 ulimit -s bisection --- conformance/rust.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/conformance/rust.sh b/conformance/rust.sh index 95e9a5689..84467a69d 100755 --- a/conformance/rust.sh +++ b/conformance/rust.sh @@ -31,6 +31,16 @@ on_killed () { echo ">>>> new execution $$ with args: $@" | tee -a /tmp/rust.log >&2 killed=true +# 256 crashes at p2p swarm init +# 300 at behaviour building +# 350 built the threadpool +# 375 p2p init +# 387 kad init +# 390 ok +# 393 ok +# 400 ok +# 450 ok +ulimit -s 8192 -c unlimited ./http "$@" 2>&1 | tee -a /tmp/rust.log || retval=$? 
killed=false echo "<<<< exiting $$ with $retval" | tee -a /tmp/rust.log >&2 From 41566e47596d1ee3ab5dada43089e935e12c95e1 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 7 Aug 2020 14:45:34 +0300 Subject: [PATCH 33/57] refactor(postorderiter): remove redundant visibilities --- unixfs/src/dir/builder/iter.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/unixfs/src/dir/builder/iter.rs b/unixfs/src/dir/builder/iter.rs index 22b28eb46..432c8a060 100644 --- a/unixfs/src/dir/builder/iter.rs +++ b/unixfs/src/dir/builder/iter.rs @@ -7,18 +7,18 @@ use std::collections::{BTreeMap, HashMap}; /// Implements the Iterator interface for owned values and the borrowed version, `next_borrowed`. /// The tree is fully constructed once this has been exhausted. pub struct PostOrderIterator<'a> { - pub(super) full_path: &'a mut String, - pub(super) old_depth: usize, - pub(super) block_buffer: &'a mut Vec, + full_path: &'a mut String, + old_depth: usize, + block_buffer: &'a mut Vec, // our stack of pending work - pub(super) pending: Vec, + pending: Vec, // "communication channel" from nested entries back to their parents - pub(super) persisted_cids: HashMap, BTreeMap>, - pub(super) reused_children: Vec, - pub(super) cid: Option, - pub(super) total_size: u64, + persisted_cids: HashMap, BTreeMap>, + reused_children: Vec, + cid: Option, + total_size: u64, // from TreeOptions - pub(super) opts: TreeOptions, + opts: TreeOptions, } impl<'a> PostOrderIterator<'a> { From fdc5f8aa84cd46950e1993e61f6eccf74ca2b6e4 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 7 Aug 2020 14:54:34 +0300 Subject: [PATCH 34/57] refactor: make PostOrderIterator 'static kind of forgot the String and Vec while trying to impl the Iterator trait to return non-static values. 
--- http/src/v0/root_files/add.rs | 5 +-- unixfs/src/dir/builder/buffered.rs | 50 ++++++++---------------------- unixfs/src/dir/builder/iter.rs | 25 ++++++--------- 3 files changed, 23 insertions(+), 57 deletions(-) diff --git a/http/src/v0/root_files/add.rs b/http/src/v0/root_files/add.rs index ece9dbde1..ac09d970d 100644 --- a/http/src/v0/root_files/add.rs +++ b/http/src/v0/root_files/add.rs @@ -274,10 +274,7 @@ where yield next; } - let mut full_path = String::new(); - let mut block_buffer = Vec::new(); - - let mut iter = tree.build(&mut full_path, &mut block_buffer); + let mut iter = tree.build(); while let Some(res) = iter.next_borrowed() { let TreeNode { path, cid, total_size, block } = res.map_err(AddError::TreeBuilding)?; diff --git a/unixfs/src/dir/builder/buffered.rs b/unixfs/src/dir/builder/buffered.rs index a9d2d5c50..095293a62 100644 --- a/unixfs/src/dir/builder/buffered.rs +++ b/unixfs/src/dir/builder/buffered.rs @@ -179,19 +179,13 @@ impl BufferingTreeBuilder { /// Returned `PostOrderIterator` will use the given `full_path` and `block_buffer` to store /// its data during the walk. `PostOrderIterator` implements `Iterator` while also allowing /// borrowed access via `next_borrowed`. 
- pub fn build<'a>( - self, - full_path: &'a mut String, - block_buffer: &'a mut Vec, - ) -> PostOrderIterator<'a> { + pub fn build(self) -> PostOrderIterator { PostOrderIterator::new( Visited::Descent { node: self.root_builder, name: None, depth: 0, }, - full_path, - block_buffer, self.opts, ) } @@ -223,11 +217,8 @@ mod tests { .put_file("a/b/c/d/e/i.txt", five_block_foobar, 221) .unwrap(); - let mut full_path = String::new(); - let mut buffer = Vec::new(); - - let iter = builder.build(&mut full_path, &mut buffer); - let actual = iter + let actual = builder + .build() .map(|res| res.map(|n| (n.path, n.cid, n.block))) .collect::, _>>() .unwrap(); @@ -255,11 +246,8 @@ mod tests { let mut builder = BufferingTreeBuilder::default(); builder.put_file("", some_cid(0), 1).unwrap(); - let mut full_path = String::new(); - let mut buffer = Vec::new(); - - let iter = builder.build(&mut full_path, &mut buffer); - let actual = iter + let actual = builder + .build() .map(|res| res.map(|OwnedTreeNode { path, .. }| path)) .collect::, _>>() .unwrap(); @@ -298,11 +286,8 @@ mod tests { .unwrap(); builder.put_file("b", five_block_foobar, 221).unwrap(); - let mut full_path = String::new(); - let mut buffer = Vec::new(); - - let iter = builder.build(&mut full_path, &mut buffer); - let actual = iter + let actual = builder + .build() .map(|res| res.map(|OwnedTreeNode { path, cid, .. }| (path, cid.to_string()))) .collect::, _>>() .unwrap(); @@ -327,11 +312,8 @@ mod tests { let mut builder = BufferingTreeBuilder::new(opts); builder.put_file("a", five_block_foobar, 221).unwrap(); - let mut full_path = String::new(); - let mut buffer = Vec::new(); - - let iter = builder.build(&mut full_path, &mut buffer); - let actual = iter + let actual = builder + .build() .map(|res| res.map(|OwnedTreeNode { path, cid, .. 
}| (path, cid.to_string()))) .collect::, _>>() .unwrap(); @@ -378,11 +360,8 @@ mod tests { builder.put_file("a/b/c/d/e.txt", some_cid(1), 1).unwrap(); builder.put_file("a/b/c/d/f.txt", some_cid(2), 1).unwrap(); - let mut full_path = String::new(); - let mut buffer = Vec::new(); - - let iter = builder.build(&mut full_path, &mut buffer); - let actual = iter + let actual = builder + .build() .map(|res| res.map(|OwnedTreeNode { path, .. }| path)) .collect::, _>>() .unwrap(); @@ -414,11 +393,8 @@ mod tests { let mut builder = BufferingTreeBuilder::default(); builder.put_file("a/b", target, 12).unwrap(); - let mut full_path = String::new(); - let mut buffer = Vec::new(); - - let iter = builder.build(&mut full_path, &mut buffer); - let actual = iter + let actual = builder + .build() .map(|res| res.map(|n| (n.path, n.cid, n.block))) .collect::, _>>() .unwrap(); diff --git a/unixfs/src/dir/builder/iter.rs b/unixfs/src/dir/builder/iter.rs index 432c8a060..e940905dd 100644 --- a/unixfs/src/dir/builder/iter.rs +++ b/unixfs/src/dir/builder/iter.rs @@ -6,10 +6,10 @@ use std::collections::{BTreeMap, HashMap}; /// /// Implements the Iterator interface for owned values and the borrowed version, `next_borrowed`. /// The tree is fully constructed once this has been exhausted. 
-pub struct PostOrderIterator<'a> { - full_path: &'a mut String, +pub struct PostOrderIterator { + full_path: String, old_depth: usize, - block_buffer: &'a mut Vec, + block_buffer: Vec, // our stack of pending work pending: Vec, // "communication channel" from nested entries back to their parents @@ -21,19 +21,12 @@ pub struct PostOrderIterator<'a> { opts: TreeOptions, } -impl<'a> PostOrderIterator<'a> { - pub(super) fn new( - root: Visited, - full_path: &'a mut String, - block_buffer: &'a mut Vec, - opts: TreeOptions, - ) -> Self { - full_path.clear(); - +impl PostOrderIterator { + pub(super) fn new(root: Visited, opts: TreeOptions) -> Self { PostOrderIterator { - full_path, + full_path: Default::default(), old_depth: 0, - block_buffer, + block_buffer: Default::default(), pending: vec![root], persisted_cids: Default::default(), reused_children: Vec::new(), @@ -233,7 +226,7 @@ impl<'a> PostOrderIterator<'a> { Visited::Post { name, depth, .. } => (name.as_deref(), *depth), }; - update_full_path((self.full_path, &mut self.old_depth), name, depth); + update_full_path((&mut self.full_path, &mut self.old_depth), name, depth); match visited { Visited::Descent { node, name, depth } => { @@ -339,7 +332,7 @@ impl<'a> PostOrderIterator<'a> { } } -impl<'a> Iterator for PostOrderIterator<'a> { +impl Iterator for PostOrderIterator { type Item = Result; fn next(&mut self) -> Option { From e38c1ff04cdd00ffea7f922f278cfa49bfbdea1a Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Aug 2020 21:48:21 +0300 Subject: [PATCH 35/57] refactor: try splitting the huge func --- http/src/v0/root_files/add.rs | 47 ++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/http/src/v0/root_files/add.rs b/http/src/v0/root_files/add.rs index ac09d970d..bb8702458 100644 --- a/http/src/v0/root_files/add.rs +++ b/http/src/v0/root_files/add.rs @@ -166,23 +166,9 @@ where match next { Some(next) => { - let mut read = 0usize; - let mut saved_any = false; 
- - while read < next.len() { - let (iter, used) = adder.push(&next.slice(read..)); - read += used; - - let maybe_tuple = import_all(&ipfs, iter).await.map_err(AddError::Persisting)?; - - let subtotal = maybe_tuple.map(|t| t.1); - - total_written += subtotal.unwrap_or(0); - - saved_any |= subtotal.is_some(); - } - - total_read += read as u64; + let (read, saved_any, written) = push_all(&ipfs, &mut adder, next).await?; + total_written += written; + total_read += read; if saved_any && opts.progress { // technically we could just send messages but that'd @@ -295,6 +281,33 @@ where } } +async fn push_all( + ipfs: &Ipfs, + adder: &mut FileAdder, + next: Bytes, +) -> Result<(u64, bool, u64), AddError> { + let mut read = 0usize; + let mut saved_any = false; + let mut total_written = 0; + + while read < next.len() { + let (iter, used) = adder.push(&next.slice(read..)); + read += used; + + let maybe_tuple = import_all(&ipfs, iter) + .await + .map_err(AddError::Persisting)?; + + let subtotal = maybe_tuple.map(|t| t.1); + + total_written += subtotal.unwrap_or(0); + + saved_any |= subtotal.is_some(); + } + + Ok((read as u64, saved_any, total_written)) +} + async fn import_all( ipfs: &Ipfs, iter: impl Iterator)>, From 8c10a0ad0dbfe18b37b6feb7d5d5562286d43421 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 7 Aug 2020 21:05:38 +0300 Subject: [PATCH 36/57] fix: remove calling ulimit while this may have succeeded on github actions it probably isn't wise to run it every time at least automatically. --- conformance/rust.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conformance/rust.sh b/conformance/rust.sh index 84467a69d..86ef14d50 100755 --- a/conformance/rust.sh +++ b/conformance/rust.sh @@ -40,7 +40,7 @@ killed=true # 393 ok # 400 ok # 450 ok -ulimit -s 8192 -c unlimited +# ulimit -s 8192 -c unlimited ./http "$@" 2>&1 | tee -a /tmp/rust.log || retval=$? 
killed=false echo "<<<< exiting $$ with $retval" | tee -a /tmp/rust.log >&2 From a3f27e5b4688dfe30e3a35db94671f3ff71ebe95 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 7 Aug 2020 16:17:05 +0300 Subject: [PATCH 37/57] fix: move all path matching to v0, box handlers This patch changes the warp filter boxing strategy so, that: 1. paths are nested with a "prefix" matcher, if possible 2. the actual handler for the matched path is boxed 3. the non implemented but recognized paths are unified This strategy might be a good compromise between optimizations and compilation time but such measurements are yet to be done. Previous boxing strategy ended up being quite unfortunate. Upon adding the full `/add` implementation, the test cases using GNU binutils linker or ld 2.33 (ubuntu) started failing failing with a thread overflowing its stack. Most of the stack frames were warp filters for matching the path. This happened because the combine! macro used in http/src/v0.rs boxed (mostly) each filter, meaning the compiler could never see through any of the and requiring enough many frames at runtime to cause a stack overflow. An alternative of opt-level = 1 turned out to be easy and simple and was explored in #293 while this solution was being searched for. The downside is increased compilation time. 
--- http/src/v0.rs | 149 +++++++++++++++++++++++++------------- http/src/v0/bitswap.rs | 13 ++-- http/src/v0/block.rs | 36 ++++----- http/src/v0/dag.rs | 12 ++- http/src/v0/id.rs | 5 +- http/src/v0/pubsub.rs | 54 ++++---------- http/src/v0/refs.rs | 13 +--- http/src/v0/root_files.rs | 21 ++---- http/src/v0/support.rs | 3 +- http/src/v0/swarm.rs | 26 +++---- http/src/v0/version.rs | 6 +- 11 files changed, 165 insertions(+), 173 deletions(-) diff --git a/http/src/v0.rs b/http/src/v0.rs index 96b5eddaa..6b674592d 100644 --- a/http/src/v0.rs +++ b/http/src/v0.rs @@ -1,5 +1,4 @@ use ipfs::{Ipfs, IpfsTypes}; -use std::convert::Infallible; use warp::{query, Filter}; pub mod bitswap; @@ -16,84 +15,130 @@ pub mod support; pub use support::recover_as_message_response; pub(crate) use support::{with_ipfs, InvalidPeerId, NotImplemented, StringError}; -/// Helper to combine the multiple filters together with Filter::or, possibly boxing the types in +/// Helper to combine multiple filters together with Filter::or, possibly boxing the types in /// the process. This greatly helps the build times for `ipfs-http`. macro_rules! combine { - ($x:expr, $($y:expr),+) => { + ($x:expr, $($y:expr),+ $(,)?) => { { - let filter = boxed_on_debug!($x); + let filter = $x; $( - let filter = boxed_on_debug!(filter.or($y)); + let filter = filter.or($y); )+ filter } } } -#[cfg(debug_assertions)] +/// Helper to combine multiple filters together with Filter::or. The filters are never boxed but +/// the output is assumed to be equal and so the output is unified. +macro_rules! combine_unify { + ($x:expr, $($y:expr),+ $(,)?) => { + { + let filter = $x; + $( + let filter = filter.or($y).unify(); + )+ + filter + } + } +} + +/// Macro will cause boxing on debug builds. Might be a good idea to explore how much boxing always +/// would speed up builds. +#[cfg(not(debug_assertions))] macro_rules! 
boxed_on_debug { ($x:expr) => { - $x.boxed() + $x }; } -#[cfg(not(debug_assertions))] +#[cfg(debug_assertions)] macro_rules! boxed_on_debug { ($x:expr) => { - $x + $x.boxed() }; } +/// Like `Filter::and` but the next filter is boxed. This might be a good idea to combine path +/// matching to the route implementation while maintaining a healthy balance for compilation time +/// and optimization. +macro_rules! and_boxed { + ($x:expr, $y:expr) => { + ($x).and(boxed_on_debug!($y)) + }; +} + +/// Supported routes of the crate. pub fn routes( ipfs: &Ipfs, shutdown_tx: tokio::sync::mpsc::Sender<()>, -) -> impl warp::Filter + Clone { - let mount = warp::path("api").and(warp::path("v0")); - - let shutdown = warp::post() - .and(warp::path!("shutdown")) - .and(warp::any().map(move || shutdown_tx.clone())) - .and_then(handle_shutdown); +) -> impl warp::Filter + Clone { + let mount = warp::post().and(warp::path!("api" / "v0" / ..)); let api = mount.and(combine!( - shutdown, - id::identity(ipfs), - root_files::add(ipfs), - bitswap::wantlist(ipfs), - bitswap::stat(ipfs), - block::get(ipfs), - block::put(ipfs), - block::rm(ipfs), - block::stat(ipfs), - warp::path!("bootstrap" / ..).and_then(not_implemented), - warp::path!("config" / ..).and_then(not_implemented), - dag::put(ipfs), - dag::resolve(ipfs), - warp::path!("dht" / ..).and_then(not_implemented), - root_files::cat(ipfs), - root_files::get(ipfs), - warp::path!("key" / ..).and_then(not_implemented), - warp::path!("name" / ..).and_then(not_implemented), - warp::path!("object" / ..).and_then(not_implemented), - warp::path!("pin" / ..).and_then(not_implemented), - warp::path!("ping" / ..).and_then(not_implemented), - pubsub::routes(ipfs), - refs::local(ipfs), - refs::refs(ipfs), - warp::path!("repo" / ..).and_then(not_implemented), - warp::path!("stats" / ..).and_then(not_implemented), - swarm::connect(ipfs), - swarm::peers(ipfs), - swarm::addrs(ipfs), - swarm::addrs_local(ipfs), - swarm::disconnect(ipfs), + and_boxed!( + 
warp::path!("shutdown"), + warp::any() + .map(move || shutdown_tx.clone()) + .and_then(handle_shutdown) + ), + and_boxed!(warp::path!("id"), id::identity(ipfs)), + and_boxed!(warp::path!("add"), root_files::add(ipfs)), + and_boxed!(warp::path!("cat"), root_files::cat(ipfs)), + and_boxed!(warp::path!("get"), root_files::get(ipfs)), + and_boxed!(warp::path!("refs" / "local"), refs::local(ipfs)), + and_boxed!(warp::path!("refs"), refs::refs(ipfs)), warp::path!("version") .and(query::()) - .and_then(version::version) + .and_then(version::version), + warp::path("bitswap").and(combine!( + and_boxed!(warp::path!("wantlist"), bitswap::wantlist(ipfs)), + and_boxed!(warp::path!("stat"), bitswap::stat(ipfs)) + )), + warp::path("block").and(combine!( + and_boxed!(warp::path!("get"), block::get(ipfs)), + and_boxed!(warp::path!("put"), block::put(ipfs)), + and_boxed!(warp::path!("rm"), block::rm(ipfs)), + and_boxed!(warp::path!("stat"), block::stat(ipfs)), + )), + warp::path("dag").and(combine!( + and_boxed!(warp::path!("put"), dag::put(ipfs)), + and_boxed!(warp::path!("resolve"), dag::resolve(ipfs)), + )), + warp::path("pubsub").and(combine!( + and_boxed!(warp::path!("peers"), pubsub::peers(ipfs)), + and_boxed!(warp::path!("ls"), pubsub::list_subscriptions(ipfs)), + and_boxed!(warp::path!("pub"), pubsub::publish(ipfs)), + and_boxed!( + warp::path!("sub"), + pubsub::subscribe(ipfs, Default::default()) + ), + )), + warp::path("swarm").and(combine!( + and_boxed!(warp::path!("addrs" / "local"), swarm::addrs_local(ipfs)), + and_boxed!(warp::path!("addrs"), swarm::addrs(ipfs)), + and_boxed!(warp::path!("connect"), swarm::connect(ipfs)), + and_boxed!(warp::path!("disconnect"), swarm::disconnect(ipfs)), + and_boxed!(warp::path!("peers"), swarm::peers(ipfs)), + )), + combine_unify!( + warp::path!("bootstrap" / ..), + warp::path!("config" / ..), + warp::path!("dht" / ..), + warp::path!("key" / ..), + warp::path!("name" / ..), + warp::path!("object" / ..), + warp::path!("pin" / ..), + 
warp::path!("ping" / ..), + warp::path!("repo" / ..), + warp::path!("stats" / ..), + ) + .and_then(not_implemented), )); // have a common handler turn the rejections into 400 or 500 with json body - api.recover(recover_as_message_response) + // boxing this might save up to 15s. + boxed_on_debug!(api.recover(recover_as_message_response)) } pub(crate) async fn handle_shutdown( @@ -105,8 +150,8 @@ pub(crate) async fn handle_shutdown( }) } -async fn not_implemented() -> Result { - Ok(warp::http::StatusCode::NOT_IMPLEMENTED) +async fn not_implemented() -> Result<(impl warp::Reply,), std::convert::Infallible> { + Ok((warp::http::StatusCode::NOT_IMPLEMENTED,)) } #[cfg(test)] @@ -149,7 +194,7 @@ mod tests { async fn invalid_peer_id_as_messageresponse() { let routes = testing_routes().await; let resp = warp::test::request() - .method("GET") + .method("POST") .path("/api/v0/id?arg=foobar") .reply(&routes) .await; diff --git a/http/src/v0/bitswap.rs b/http/src/v0/bitswap.rs index 2ed139c96..b950ea060 100644 --- a/http/src/v0/bitswap.rs +++ b/http/src/v0/bitswap.rs @@ -2,7 +2,7 @@ use crate::v0::support::{with_ipfs, InvalidPeerId, StringError}; use ipfs::{BitswapStats, Ipfs, IpfsTypes}; use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; -use warp::{path, query, reply, Filter, Rejection, Reply}; +use warp::{query, reply, Filter, Rejection, Reply}; #[derive(Debug, Deserialize)] pub struct WantlistQuery { @@ -39,9 +39,8 @@ async fn wantlist_query( pub fn wantlist( ipfs: &Ipfs, -) -> impl Filter + Clone { - path!("bitswap" / "wantlist") - .and(with_ipfs(ipfs)) +) -> impl Filter + Clone { + with_ipfs(ipfs) .and(query::()) .and_then(wantlist_query) } @@ -99,8 +98,6 @@ async fn stat_query(ipfs: Ipfs) -> Result( ipfs: &Ipfs, -) -> impl Filter + Clone { - path!("bitswap" / "stat") - .and(with_ipfs(ipfs)) - .and_then(stat_query) +) -> impl Filter + Clone { + with_ipfs(ipfs).and_then(stat_query) } diff --git a/http/src/v0/block.rs b/http/src/v0/block.rs index 
3ce392831..ceae05232 100644 --- a/http/src/v0/block.rs +++ b/http/src/v0/block.rs @@ -12,7 +12,7 @@ use mime::Mime; use multihash::Multihash; use serde::{Deserialize, Serialize}; use std::convert::TryFrom; -use warp::{http::Response, path, query, reply, Filter, Rejection, Reply}; +use warp::{http::Response, query, reply, Filter, Rejection, Reply}; mod options; use options::RmOptions; @@ -44,9 +44,8 @@ async fn get_query( pub fn get( ipfs: &Ipfs, -) -> impl Filter + Clone { - path!("block" / "get") - .and(with_ipfs(ipfs)) +) -> impl Filter + Clone { + with_ipfs(ipfs) .and(query::()) .and_then(get_query) } @@ -88,9 +87,8 @@ impl PutQuery { pub fn put( ipfs: &Ipfs, -) -> impl Filter + Clone { - path!("block" / "put") - .and(with_ipfs(ipfs)) +) -> impl Filter + Clone { + with_ipfs(ipfs) .and(query::()) .and(warp::header::("content-type")) // TODO: rejects if missing .and(warp::body::stream()) @@ -151,11 +149,8 @@ pub struct EmptyResponse; pub fn rm( ipfs: &Ipfs, -) -> impl Filter + Clone { - path!("block" / "rm") - .and(with_ipfs(ipfs)) - .and(rm_options()) - .and_then(rm_query) +) -> impl Filter + Clone { + with_ipfs(ipfs).and(rm_options()).and_then(rm_query) } fn rm_options() -> impl Filter + Clone { @@ -221,6 +216,14 @@ async fn rm_query( Ok(StreamResponse(st)) } +pub fn stat( + ipfs: &Ipfs, +) -> impl Filter + Clone { + with_ipfs(ipfs) + .and(query::()) + .and_then(stat_query) +} + async fn stat_query( ipfs: Ipfs, query: GetStatOptions, @@ -238,12 +241,3 @@ async fn stat_query( "Size": block.data().len(), }))) } - -pub fn stat( - ipfs: &Ipfs, -) -> impl Filter + Clone { - path!("block" / "stat") - .and(with_ipfs(ipfs)) - .and(query::()) - .and_then(stat_query) -} diff --git a/http/src/v0/dag.rs b/http/src/v0/dag.rs index efc8a7525..55cafaef8 100644 --- a/http/src/v0/dag.rs +++ b/http/src/v0/dag.rs @@ -9,7 +9,7 @@ use mime::Mime; use serde::Deserialize; use serde_json::json; -use warp::{path, query, reply, Buf, Filter, Rejection, Reply}; +use warp::{query, reply, 
Buf, Filter, Rejection, Reply}; #[derive(Debug, Deserialize)] pub struct PutQuery { @@ -35,9 +35,8 @@ impl Default for InputEncoding { pub fn put( ipfs: &Ipfs, -) -> impl Filter + Clone { - path!("dag" / "put") - .and(with_ipfs(ipfs)) +) -> impl Filter + Clone { + with_ipfs(ipfs) .and(query::()) .and(warp::header::("content-type")) // TODO: rejects if missing .and(warp::body::stream()) @@ -106,9 +105,8 @@ async fn put_query( /// (rempath). pub fn resolve( ipfs: &Ipfs, -) -> impl Filter + Clone { - path!("dag" / "resolve") - .and(with_ipfs(ipfs)) +) -> impl Filter + Clone { + with_ipfs(ipfs) .and(query::()) .and_then(inner_resolve) } diff --git a/http/src/v0/id.rs b/http/src/v0/id.rs index bd79635a3..33a15c3fc 100644 --- a/http/src/v0/id.rs +++ b/http/src/v0/id.rs @@ -6,9 +6,8 @@ use warp::{query, Filter}; pub fn identity( ipfs: &Ipfs, -) -> impl Filter + Clone { - warp::path!("id") - .and(with_ipfs(ipfs)) +) -> impl Filter + Clone { + with_ipfs(ipfs) .and(optional_peer_id()) .and_then(identity_query) } diff --git a/http/src/v0/pubsub.rs b/http/src/v0/pubsub.rs index feb17101e..0696ee71e 100644 --- a/http/src/v0/pubsub.rs +++ b/http/src/v0/pubsub.rs @@ -40,26 +40,11 @@ pub struct Pubsub { Mutex>>>, } -/// Creates a filter composing pubsub/{peers,ls,pub,sub}. 
-pub fn routes( - ipfs: &Ipfs, -) -> impl warp::Filter + Clone { - warp::path("pubsub").and( - peers(ipfs) - .or(list_subscriptions(ipfs)) - .or(publish(ipfs)) - .or(subscribe(ipfs, Default::default())), - ) -} - /// Handling of https://docs-beta.ipfs.io/reference/http/api/#api-v0-pubsub-peers pub fn peers( ipfs: &Ipfs, -) -> impl warp::Filter + Clone { - warp::path!("peers") - .and(warp::get().or(warp::post())) - .unify() - .and(with_ipfs(ipfs)) +) -> impl warp::Filter + Clone { + with_ipfs(ipfs) .and(warp::query::().map(|tp: OptionalTopicParameter| tp.topic)) .and_then(inner_peers) } @@ -67,44 +52,38 @@ pub fn peers( async fn inner_peers( ipfs: Ipfs, topic: Option, -) -> Result { +) -> Result<(impl warp::Reply,), warp::Rejection> { let peers = ipfs .pubsub_peers(topic) .await .map_err(|e| warp::reject::custom(StringError::from(e)))?; - Ok(warp::reply::json(&StringListResponse { + Ok((warp::reply::json(&StringListResponse { strings: peers.into_iter().map(|id| id.to_string()).collect(), - })) + }),)) } /// Handling of https://docs-beta.ipfs.io/reference/http/api/#api-v0-pubsub-ls pub fn list_subscriptions( ipfs: &Ipfs, -) -> impl warp::Filter + Clone { - warp::path!("ls") - .and(warp::get().or(warp::post())) - .unify() - .and(with_ipfs(ipfs)) - .and_then(inner_ls) +) -> impl warp::Filter + Clone { + with_ipfs(ipfs).and_then(inner_ls) } -async fn inner_ls(ipfs: Ipfs) -> Result { +async fn inner_ls(ipfs: Ipfs) -> Result<(impl warp::Reply,), warp::Rejection> { let topics = ipfs .pubsub_subscribed() .await .map_err(|e| warp::reject::custom(StringError::from(e)))?; - Ok(warp::reply::json(&StringListResponse { strings: topics })) + Ok((warp::reply::json(&StringListResponse { strings: topics }),)) } /// Handling of https://docs-beta.ipfs.io/reference/http/api/#api-v0-pubsub-pub pub fn publish( ipfs: &Ipfs, -) -> impl warp::Filter + Clone { - warp::path!("pub") - .and(warp::post()) - .and(with_ipfs(ipfs)) +) -> impl warp::Filter + Clone { + with_ipfs(ipfs) 
.and(publish_args("arg")) .and_then(inner_publish) } @@ -112,11 +91,11 @@ pub fn publish( async fn inner_publish( ipfs: Ipfs, PublishArgs { topic, message }: PublishArgs, -) -> Result { +) -> Result<(impl warp::Reply,), warp::Rejection> { ipfs.pubsub_publish(topic, message.into_inner()) .await .map_err(|e| warp::reject::custom(StringError::from(e)))?; - Ok(warp::reply::reply()) + Ok((warp::reply::reply(),)) } /// Handling of https://docs-beta.ipfs.io/reference/http/api/#api-v0-pubsub-sub @@ -127,11 +106,8 @@ async fn inner_publish( pub fn subscribe( ipfs: &Ipfs, pubsub: Arc, -) -> impl warp::Filter + Clone { - warp::path!("sub") - .and(warp::get().or(warp::post())) - .unify() - .and(with_ipfs(ipfs)) +) -> impl warp::Filter + Clone { + with_ipfs(ipfs) .and(warp::any().map(move || pubsub.clone())) .and(warp::query::()) .and_then(|ipfs, pubsub, TopicParameter { topic }| async move { diff --git a/http/src/v0/refs.rs b/http/src/v0/refs.rs index 6f3695d93..3f7d998c0 100644 --- a/http/src/v0/refs.rs +++ b/http/src/v0/refs.rs @@ -26,11 +26,8 @@ use crate::v0::support::{HandledErr, StreamResponse}; /// https://docs-beta.ipfs.io/reference/http/api/#api-v0-refs pub fn refs( ipfs: &Ipfs, -) -> impl Filter + Clone { - warp::path!("refs") - .and(with_ipfs(ipfs)) - .and(refs_options()) - .and_then(refs_inner) +) -> impl Filter + Clone { + with_ipfs(ipfs).and(refs_options()).and_then(refs_inner) } async fn refs_inner( @@ -588,10 +585,8 @@ fn dagpb_links(ipld: Ipld) -> Vec<(Option, Cid)> { /// Handling of https://docs-beta.ipfs.io/reference/http/api/#api-v0-refs-local pub fn local( ipfs: &Ipfs, -) -> impl Filter + Clone { - warp::path!("refs" / "local") - .and(with_ipfs(ipfs)) - .and_then(inner_local) +) -> impl Filter + Clone { + with_ipfs(ipfs).and_then(inner_local) } async fn inner_local(ipfs: Ipfs) -> Result { diff --git a/http/src/v0/root_files.rs b/http/src/v0/root_files.rs index 7a9b5e4ab..773e3dac0 100644 --- a/http/src/v0/root_files.rs +++ b/http/src/v0/root_files.rs @@ 
-14,7 +14,7 @@ use serde::Deserialize; use std::convert::TryFrom; use std::fmt; use std::path::Path; -use warp::{path, query, Filter, Rejection, Reply}; +use warp::{query, Filter, Rejection, Reply}; mod tar_helper; use tar_helper::TarHelper; @@ -36,9 +36,8 @@ pub struct AddArgs { pub fn add( ipfs: &Ipfs, -) -> impl Filter + Clone { - path!("add") - .and(with_ipfs(ipfs)) +) -> impl Filter + Clone { + with_ipfs(ipfs) .and(query::()) .and(warp::header::("content-type")) // TODO: rejects if missing .and(warp::body::stream()) @@ -56,11 +55,8 @@ pub struct CatArgs { pub fn cat( ipfs: &Ipfs, -) -> impl Filter + Clone { - path!("cat") - .and(with_ipfs(ipfs)) - .and(query::()) - .and_then(cat_inner) +) -> impl Filter + Clone { + with_ipfs(ipfs).and(query::()).and_then(cat_inner) } async fn cat_inner(ipfs: Ipfs, args: CatArgs) -> Result { @@ -115,11 +111,8 @@ struct GetArgs { pub fn get( ipfs: &Ipfs, -) -> impl Filter + Clone { - path!("get") - .and(with_ipfs(ipfs)) - .and(query::()) - .and_then(get_inner) +) -> impl Filter + Clone { + with_ipfs(ipfs).and(query::()).and_then(get_inner) } async fn get_inner(ipfs: Ipfs, args: GetArgs) -> Result { diff --git a/http/src/v0/support.rs b/http/src/v0/support.rs index d0c54bc4d..2004095b6 100644 --- a/http/src/v0/support.rs +++ b/http/src/v0/support.rs @@ -1,7 +1,6 @@ use ipfs::{Ipfs, IpfsTypes}; use serde::Serialize; use std::borrow::Cow; -use std::convert::Infallible; use std::error::Error as StdError; use std::fmt; @@ -147,7 +146,7 @@ impl StringError { /// Common rejection handling strategy for ipfs http api compatible error responses pub async fn recover_as_message_response( err: warp::reject::Rejection, -) -> Result { +) -> Result { use warp::http::StatusCode; use warp::reject::{InvalidQuery, LengthRequired, MethodNotAllowed}; diff --git a/http/src/v0/swarm.rs b/http/src/v0/swarm.rs index 9303be84f..6d245c51f 100644 --- a/http/src/v0/swarm.rs +++ b/http/src/v0/swarm.rs @@ -23,9 +23,8 @@ async fn connect_query( pub fn connect( 
ipfs: &Ipfs, -) -> impl Filter + Clone { - warp::path!("swarm" / "connect") - .and(with_ipfs(ipfs)) +) -> impl Filter + Clone { + with_ipfs(ipfs) .and(query::()) .and_then(connect_query) } @@ -89,9 +88,8 @@ async fn peers_query( pub fn peers( ipfs: &Ipfs, -) -> impl Filter + Clone { - warp::path!("swarm" / "peers") - .and(with_ipfs(ipfs)) +) -> impl Filter + Clone { + with_ipfs(ipfs) .and(query::()) .and_then(peers_query) } @@ -120,10 +118,8 @@ async fn addrs_query(ipfs: Ipfs) -> Result( ipfs: &Ipfs, -) -> impl Filter + Clone { - warp::path!("swarm" / "addrs") - .and(with_ipfs(ipfs)) - .and_then(addrs_query) +) -> impl Filter + Clone { + with_ipfs(ipfs).and_then(addrs_query) } #[derive(Debug, Deserialize)] @@ -154,9 +150,8 @@ async fn addrs_local_query( pub fn addrs_local( ipfs: &Ipfs, -) -> impl Filter + Clone { - warp::path!("swarm" / "addrs" / "local") - .and(with_ipfs(ipfs)) +) -> impl Filter + Clone { + with_ipfs(ipfs) .and(query::()) .and_then(addrs_local_query) } @@ -179,9 +174,8 @@ async fn disconnect_query( pub fn disconnect( ipfs: &Ipfs, -) -> impl Filter + Clone { - warp::path!("swarm" / "disconnect") - .and(with_ipfs(ipfs)) +) -> impl Filter + Clone { + with_ipfs(ipfs) .and(query::()) .and_then(disconnect_query) } diff --git a/http/src/v0/version.rs b/http/src/v0/version.rs index 1cccec7e0..bd291e750 100644 --- a/http/src/v0/version.rs +++ b/http/src/v0/version.rs @@ -24,12 +24,14 @@ pub struct Response { // https://docs-beta.ipfs.io/reference/http/api/#api-v0-version // Note: the parameter formatting is only verified, feature looks to be unimplemented for `go-ipfs // 0.4.23` and handled by cli. This is not compatible with `rust-ipfs-api`. -pub async fn version(_query: Query) -> Result { +pub fn version( + _query: Query, +) -> impl std::future::Future> { let response = Response { version: env!("CARGO_PKG_VERSION"), // TODO: move over to rust-ipfs not to worry about syncing version numbers? 
commit: env!("VERGEN_SHA_SHORT"), repo: "", }; - Ok(warp::reply::json(&response)) + futures::future::ready(Ok((warp::reply::json(&response),))) } From 07c41c0573d5b477cac0a0435c244dbf7af2966e Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 7 Aug 2020 21:15:48 +0300 Subject: [PATCH 38/57] fix: tests after going from Infallible => Rejection the change was made to allow boxing the v0::routes return value. --- http/src/v0.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/http/src/v0.rs b/http/src/v0.rs index 6b674592d..102b278f9 100644 --- a/http/src/v0.rs +++ b/http/src/v0.rs @@ -156,11 +156,9 @@ async fn not_implemented() -> Result<(impl warp::Reply,), std::convert::Infallib #[cfg(test)] mod tests { - use std::convert::Infallible; - /// Creates routes for tests, the ipfs will not work as no background task is being spawned. async fn testing_routes( - ) -> impl warp::Filter + Clone { + ) -> impl warp::Filter + Clone { use super::routes; use ipfs::{IpfsOptions, UninitializedIpfs}; From 9a8267d5b5870ec4ce9d5b1e01b48d8bba0c22f8 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sat, 8 Aug 2020 11:52:15 +0300 Subject: [PATCH 39/57] chore: update stack limit notes --- Cargo.lock | 1 + conformance/rust.sh | 31 ++++++++++++++++++++++--------- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1b4633d50..1d01a3d6b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1229,6 +1229,7 @@ dependencies = [ "multihash", "quick-protobuf", "sha2 0.9.1", + "tar", ] [[package]] diff --git a/conformance/rust.sh b/conformance/rust.sh index 86ef14d50..8e2a590ec 100755 --- a/conformance/rust.sh +++ b/conformance/rust.sh @@ -31,15 +31,28 @@ on_killed () { echo ">>>> new execution $$ with args: $@" | tee -a /tmp/rust.log >&2 killed=true -# 256 crashes at p2p swarm init -# 300 at behaviour building -# 350 built the threadpool -# 375 p2p init -# 387 kad init -# 390 ok -# 393 ok -# 400 ok -# 450 ok + +# +# testing around the 
time of PR #284 +# +# binutils | lld-9 | +# 2.33 | 9.0.0 | notes +# ---------+-------+-------------------------------------- +# 256 | | crashes at id, unlikely inits? +# | 256 | crashes at p2p swarm init +# | 300 | crashes at behaviour building +# | 350 | crashes but built the dns threadpool +# | 375 | crashes at p2p init +# | 387 | crashes at kad init +# | 390 | ok +# | 393 | ok +# | 400 | ok +# | 450 | ok +# 512 | | crashes at id, unlikely inits? +# 1024 | | crashes right away unlikely inits +# 4096 | | still the same +# 8192 | | works without -c unlimited? +# # ulimit -s 8192 -c unlimited ./http "$@" 2>&1 | tee -a /tmp/rust.log || retval=$? killed=false From 25e56fd51d5660618b706d3dec432cb708f81a70 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sat, 8 Aug 2020 11:52:31 +0300 Subject: [PATCH 40/57] add: missing fmt::Debug impl --- unixfs/Cargo.toml | 1 + unixfs/src/dir/builder/iter.rs | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/unixfs/Cargo.toml b/unixfs/Cargo.toml index 5ffcc4e64..086bfd1fa 100644 --- a/unixfs/Cargo.toml +++ b/unixfs/Cargo.toml @@ -23,3 +23,4 @@ sha2 = { default-features = false, version = "0.9" } hex-literal = { default-features = false, version = "0.3" } libc = { default-features = false, version = "0.2.71" } multibase = { default-features = false, version = "0.8.0" } +tar = { default-features = false, version = "0.4" } diff --git a/unixfs/src/dir/builder/iter.rs b/unixfs/src/dir/builder/iter.rs index e940905dd..a42e8ffb0 100644 --- a/unixfs/src/dir/builder/iter.rs +++ b/unixfs/src/dir/builder/iter.rs @@ -1,6 +1,7 @@ use super::{Entry, Leaf, TreeConstructionFailed, TreeOptions, Visited}; use cid::Cid; use std::collections::{BTreeMap, HashMap}; +use std::fmt; /// Constructs the directory nodes required for a tree. 
/// @@ -353,6 +354,17 @@ pub struct TreeNode<'a> { pub block: &'a [u8], } +impl<'a> fmt::Debug for TreeNode<'a> { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt.debug_struct("TreeNode") + .field("path", &format_args!("{:?}", self.path)) + .field("cid", &format_args!("{}", self.cid)) + .field("total_size", &self.total_size) + .field("size", &self.block.len()) + .finish() + } +} + impl TreeNode<'_> { /// Convert to an owned and detached representation. pub fn into_owned(self) -> OwnedTreeNode { From dc4ff9cee882c2643f3f79fcef795e591847e92a Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sat, 8 Aug 2020 11:52:50 +0300 Subject: [PATCH 41/57] add: tar ingestion example, bench copypaste sadly --- Cargo.lock | 250 ++++++++++++++++++++++++++++- unixfs/Cargo.toml | 5 + unixfs/benches/ingest-linux-tar.rs | 118 ++++++++++++++ unixfs/examples/ingest-tar.rs | 177 ++++++++++++++++++++ 4 files changed, 546 insertions(+), 4 deletions(-) create mode 100644 unixfs/benches/ingest-linux-tar.rs create mode 100644 unixfs/examples/ingest-tar.rs diff --git a/Cargo.lock b/Cargo.lock index 1d01a3d6b..5cc832a71 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -184,6 +184,17 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "065374052e7df7ee4047b1160cca5e1467a12351a40b3da123c870ba0b8eda2a" +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi 0.3.9", +] + [[package]] name = "autocfg" version = "1.0.0" @@ -314,6 +325,18 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "476e9cd489f9e121e02ffa6014a8ef220ecb15c05ed23fc34cca13925dc283fb" +[[package]] +name = "bstr" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"31accafdb70df7871592c058eca3985b71104e15ac32f64706022c58867da931" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + [[package]] name = "bumpalo" version = "3.4.0" @@ -366,6 +389,15 @@ version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "631ae5198c9be5e753e5cc215e1bd73c2b466a3565173db433f52bb9d3e66dba" +[[package]] +name = "cast" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b9434b9a5aa1450faa3f9cb14ea0e8c53bb5d2b3c1bfd1ab4fc03e9f33fbfb0" +dependencies = [ + "rustc_version", +] + [[package]] name = "cc" version = "1.0.58" @@ -441,6 +473,79 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8aebca1129a03dc6dc2b127edd729435bbc4a37e1d5f4d7513165089ceb02634" +[[package]] +name = "criterion" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70daa7ceec6cf143990669a04c7df13391d55fb27bd4079d252fca774ba244d8" +dependencies = [ + "atty", + "cast", + "clap", + "criterion-plot", + "csv", + "itertools 0.9.0", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_cbor", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e022feadec601fba1649cfa83586381a4ad31c6bf3a9ab7d408118b05dd9889d" +dependencies = [ + "cast", + "itertools 0.9.0", +] + +[[package]] +name = "crossbeam-deque" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f02af974daeee82218205558e51ec8768b48cf524bd01d550abe5573a608285" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", + "maybe-uninit", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"058ed274caafc1f60c4997b5fc07bf7dc7cca454af7c6e81edffe5f33f70dace" +dependencies = [ + "autocfg", + "cfg-if", + "crossbeam-utils", + "lazy_static", + "maybe-uninit", + "memoffset", + "scopeguard", +] + +[[package]] +name = "crossbeam-queue" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "774ba60a54c213d409d5353bda12d49cd68d14e45036a285234c8d6f91f92570" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "maybe-uninit", +] + [[package]] name = "crossbeam-utils" version = "0.7.2" @@ -468,6 +573,28 @@ dependencies = [ "subtle 1.0.0", ] +[[package]] +name = "csv" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00affe7f6ab566df61b4be3ce8cf16bc2576bca0963ceb0955e45d514bf9a279" +dependencies = [ + "bstr", + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + [[package]] name = "ctr" version = "0.3.2" @@ -972,6 +1099,12 @@ dependencies = [ "tracing", ] +[[package]] +name = "half" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d36fab90f82edc3c747f9d438e06cf0a491055896f2a279638bb5beed6c40177" + [[package]] name = "hashbrown" version = "0.8.1" @@ -1221,6 +1354,7 @@ name = "ipfs-unixfs" version = "0.0.1" dependencies = [ "cid", + "criterion", "either", "filetime", "hex-literal", @@ -1247,6 +1381,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "0.4.6" @@ -1661,6 +1804,15 @@ version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400" +[[package]] +name = "memoffset" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c198b026e1bbf08a937e94c6c60f9ec4a2267f5b0d2eec9c1b21b061ce2be55f" +dependencies = [ + "autocfg", +] + [[package]] name = "mime" version = "0.3.16" @@ -1856,6 +2008,12 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b631f7e854af39a1739f401cf34a8a013dfe09eac4fa4dba91e9768bd28168d" +[[package]] +name = "oorandom" +version = "11.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a170cebd8021a008ea92e4db85a72f80b35df514ec664b296fdcbb654eac0b2c" + [[package]] name = "opaque-debug" version = "0.2.3" @@ -2029,6 +2187,18 @@ version = "0.3.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d36492546b6af1463394d46f0c834346f31548646f6ba10849802c9c9a27ac33" +[[package]] +name = "plotters" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d1685fbe7beba33de0330629da9d955ac75bd54f33d7b79f9a895590124f6bb" +dependencies = [ + "js-sys", + "num-traits", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "ppv-lite86" version = "0.2.8" @@ -2098,7 +2268,7 @@ checksum = "02b10678c913ecbd69350e8535c3aef91a8676c0773fc1d7b95cdd196d7f2f26" dependencies = [ "bytes 0.5.6", "heck", - "itertools", + "itertools 0.8.2", "log", "multimap", "petgraph", @@ -2115,7 +2285,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "537aa19b95acde10a12fec4301466386f757403de4cd4e5b4fa78fb5ecb18f72" dependencies = [ "anyhow", - "itertools", + "itertools 0.8.2", "proc-macro2", "quote", "syn", @@ -2140,8 +2310,6 @@ checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" [[package]] name = "quick-protobuf" version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "e489d4a83c17ea69b0291630229b5d4c92a94a3bf0165f7f72f506e94cda8b4b" dependencies = [ "byteorder 1.3.4", ] @@ -2245,6 +2413,31 @@ dependencies = [ "rand_core 0.5.1", ] +[[package]] +name = "rayon" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62f02856753d04e03e26929f820d0a0a337ebe71f849801eea335d464b349080" +dependencies = [ + "autocfg", + "crossbeam-deque", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e92e15d89083484e11353891f1af602cc661426deb9564c298b270c726973280" +dependencies = [ + "crossbeam-deque", + "crossbeam-queue", + "crossbeam-utils", + "lazy_static", + "num_cpus", +] + [[package]] name = "rdrand" version = "0.4.0" @@ -2366,6 +2559,15 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "scoped-tls" version = "1.0.0" @@ -2408,6 +2610,16 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "serde_cbor" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e18acfa2f90e8b735b2836ab8d538de304cbb6729a7360729ea5a895d15a622" +dependencies = [ + "half", + "serde", +] + [[package]] name = "serde_derive" version = "1.0.114" @@ -2724,6 +2936,16 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "tinytemplate" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d3dc76004a03cec1c5932bca4cdc2e39aaa798e3f82363dd94f9adf6098c12f" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "tinyvec" version = "0.3.3" 
@@ -3023,6 +3245,17 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9571542c2ce85ce642e6b58b3364da2fb53526360dfb7c211add4f5c23105ff7" +[[package]] +name = "walkdir" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "777182bc735b6424e1a57516d35ed72cb8019d85c8c9bf536dccb3445c1a2f7d" +dependencies = [ + "same-file", + "winapi 0.3.9", + "winapi-util", +] + [[package]] name = "want" version = "0.3.0" @@ -3203,6 +3436,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi 0.3.9", +] + [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" diff --git a/unixfs/Cargo.toml b/unixfs/Cargo.toml index 086bfd1fa..cc49a57d6 100644 --- a/unixfs/Cargo.toml +++ b/unixfs/Cargo.toml @@ -24,3 +24,8 @@ hex-literal = { default-features = false, version = "0.3" } libc = { default-features = false, version = "0.2.71" } multibase = { default-features = false, version = "0.8.0" } tar = { default-features = false, version = "0.4" } +criterion = "0.3" + +[[bench]] +name = "ingest-linux-tar" +harness = false diff --git a/unixfs/benches/ingest-linux-tar.rs b/unixfs/benches/ingest-linux-tar.rs new file mode 100644 index 000000000..c55bb12de --- /dev/null +++ b/unixfs/benches/ingest-linux-tar.rs @@ -0,0 +1,118 @@ +use criterion::{black_box, criterion_group, criterion_main, Criterion}; + +pub fn criterion_benchmark(c: &mut Criterion) { + let tar_bytes = std::fs::read("linux-5.6.tar").expect("read failed"); + c.bench_function("ingest-linux-tar", |b| b.iter(|| ingest_tar(&tar_bytes))); +} + +fn ingest_tar(bytes: &[u8]) { + use cid::Cid; + use 
ipfs_unixfs::dir::builder::{BufferingTreeBuilder, TreeOptions}; + use ipfs_unixfs::file::adder::FileAdder; + use std::io::Read; + + let mut buffer = Vec::new(); + + let mut archive = tar::Archive::new(std::io::Cursor::new(bytes)); + let mut entries = archive.entries().unwrap(); + + let mut opts = TreeOptions::default(); + opts.wrap_with_directory(); + let mut tree = BufferingTreeBuilder::new(opts); + + while let Some(entry) = entries.next() { + let mut entry = entry.unwrap(); + let path = std::str::from_utf8(&*entry.path_bytes()) + .unwrap() + .to_string(); // need to get rid of this + + if let Some(_link_name) = entry.link_name_bytes() { + continue; + } + + if !path.ends_with('/') { + let mut adder = FileAdder::default(); + + // with the std::io::Read it'd be good to read into the fileadder, or read into ... + // something. trying to acccess the buffer from in side FileAdder does not seem the be the + // way to go. + + if let Some(needed) = adder.size_hint().checked_sub(buffer.capacity()) { + buffer.reserve(needed); + } + + if let Some(mut needed) = adder.size_hint().checked_sub(buffer.len()) { + let zeros = [0u8; 64]; + + while needed > zeros.len() { + buffer.extend_from_slice(&zeros[..]); + needed -= zeros.len(); + } + + buffer.extend(std::iter::repeat(0).take(needed)); + } + + let mut total_written = 0usize; + + loop { + match entry.read(&mut buffer[0..]).unwrap() { + 0 => { + let blocks = adder.finish(); + let (cid, subtotal) = blocks + .fold( + None, + |acc: Option<(Cid, usize)>, (cid, bytes): (Cid, Vec)| match acc + { + Some((_, total)) => Some((cid, total + bytes.len())), + None => Some((cid, bytes.len())), + }, + ) + .expect("this is probably always present"); + + total_written += subtotal; + + tree.put_file(&path, cid, total_written as u64).unwrap(); + break; + } + n => { + let mut read = 0; + while read < n { + let (blocks, consumed) = adder.push(&buffer[read..n]); + read += consumed; + total_written += blocks.map(|(_, bytes)| bytes.len()).sum::(); + } + 
} + } + } + } else { + tree.set_metadata(&path[..path.len() - 1], ipfs_unixfs::Metadata::default()) + .unwrap(); + } + } + + let mut iter = tree.build(); + + let mut last: Option<(Cid, u64, usize)> = None; + + while let Some(res) = iter.next_borrowed() { + let res = res.unwrap(); + + match &mut last { + Some(ref mut s) => { + s.0 = res.cid.to_owned(); + s.1 = res.total_size; + s.2 = res.block.len(); + } + n @ None => { + *n = Some((res.cid.to_owned(), res.total_size, res.block.len())); + } + } + } + + let last = last.unwrap(); + + black_box(last); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/unixfs/examples/ingest-tar.rs b/unixfs/examples/ingest-tar.rs new file mode 100644 index 000000000..14f2e08ca --- /dev/null +++ b/unixfs/examples/ingest-tar.rs @@ -0,0 +1,177 @@ +use cid::Cid; +use ipfs_unixfs::dir::builder::{BufferingTreeBuilder, TreeOptions}; +use ipfs_unixfs::file::adder::FileAdder; +use std::fmt; +use std::io::Read; +use std::time::{Duration, Instant}; + +fn main() { + let started = Instant::now(); + + let stdin = std::io::stdin(); + let stdin = stdin.lock(); + + let mut archive = tar::Archive::new(stdin); + let mut entries = archive.entries().unwrap(); + + let mut buffer = Vec::new(); + + let mut opts = TreeOptions::default(); + opts.wrap_with_directory(); + let mut tree = BufferingTreeBuilder::new(opts); + + while let Some(entry) = entries.next() { + let mut entry = entry.unwrap(); + let path = std::str::from_utf8(&*entry.path_bytes()) + .unwrap() + .to_string(); // need to get rid of this + + if let Some(_link_name) = entry.link_name_bytes() { + continue; + } + + if !path.ends_with('/') { + let mut adder = FileAdder::default(); + + // with the std::io::Read it'd be good to read into the fileadder, or read into ... + // something. trying to acccess the buffer from in side FileAdder does not seem the be the + // way to go. 
+ + if let Some(needed) = adder.size_hint().checked_sub(buffer.capacity()) { + buffer.reserve(needed); + } + + if let Some(mut needed) = adder.size_hint().checked_sub(buffer.len()) { + let zeros = [0u8; 64]; + + while needed > zeros.len() { + buffer.extend_from_slice(&zeros[..]); + needed -= zeros.len(); + } + + buffer.extend(std::iter::repeat(0).take(needed)); + } + + let mut total_written = 0usize; + + loop { + match entry.read(&mut buffer[0..]).unwrap() { + 0 => { + let blocks = adder.finish(); + let (cid, subtotal) = blocks + .fold( + None, + |acc: Option<(Cid, usize)>, (cid, bytes): (Cid, Vec)| match acc + { + Some((_, total)) => Some((cid, total + bytes.len())), + None => Some((cid, bytes.len())), + }, + ) + .expect("this is probably always present"); + + total_written += subtotal; + + tree.put_file(&path, cid, total_written as u64).unwrap(); + break; + } + n => { + let mut read = 0; + while read < n { + let (blocks, consumed) = adder.push(&buffer[read..n]); + read += consumed; + total_written += blocks.map(|(_, bytes)| bytes.len()).sum::(); + } + } + } + } + } else { + tree.set_metadata(&path[..path.len() - 1], ipfs_unixfs::Metadata::default()) + .unwrap(); + } + } + + let mut iter = tree.build(); + + let mut last: Option<(Cid, u64, usize)> = None; + + while let Some(res) = iter.next_borrowed() { + let res = res.unwrap(); + + match &mut last { + Some(ref mut s) => { + s.0 = res.cid.to_owned(); + s.1 = res.total_size; + s.2 = res.block.len(); + } + n @ None => { + *n = Some((res.cid.to_owned(), res.total_size, res.block.len())); + } + } + } + + let last = last.unwrap(); + + println!("{} ({} bytes), total: {} bytes", last.0, last.2, last.1); + + let process_stats = get_process_stats(started); + + match process_stats { + Ok(all) => eprintln!("{}", all), + Err(wall) => eprintln!("wall_time: {:?}", wall), + } +} + +struct ProcessStats { + user_time: Duration, + system_time: Duration, + max_rss: i64, + wall_time: Duration, +} + +impl fmt::Display for ProcessStats 
{ + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + fmt, + "Max RSS: {} KB, utime: {:?}, stime: {:?}, total: {:?}, wall_time: {:?}", + self.max_rss, + self.user_time, + self.system_time, + self.user_time + self.system_time, + self.wall_time, + ) + } +} + +#[cfg(unix)] +fn get_process_stats(started_at: Instant) -> Result { + fn to_duration(tv: libc::timeval) -> Duration { + assert!(tv.tv_sec >= 0); + Duration::new(tv.tv_sec as u64, tv.tv_usec as u32) + } + + let (max_rss, user_time, system_time) = unsafe { + let mut rusage: libc::rusage = std::mem::zeroed(); + + let retval = libc::getrusage(libc::RUSAGE_SELF, &mut rusage as *mut _); + + assert_eq!(retval, 0); + + (rusage.ru_maxrss, rusage.ru_utime, rusage.ru_stime) + }; + + let user_time = to_duration(user_time); + let system_time = to_duration(system_time); + let wall_time = started_at.elapsed(); + + Ok(ProcessStats { + user_time, + system_time, + max_rss, + wall_time, + }) +} + +#[cfg(not(unix))] +fn get_process_stats(started_at: Instant) -> Result { + Err(started_at.elapsed()) +} From bcab2bd3b1d29d907bd6f0e5b0c72e0f00b2b8d0 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sun, 9 Aug 2020 15:23:12 +0300 Subject: [PATCH 42/57] refactor: move Visited under iter.rs --- unixfs/src/dir/builder.rs | 16 ---------------- unixfs/src/dir/builder/buffered.rs | 11 ++--------- unixfs/src/dir/builder/iter.rs | 26 +++++++++++++++++++++++--- 3 files changed, 25 insertions(+), 28 deletions(-) diff --git a/unixfs/src/dir/builder.rs b/unixfs/src/dir/builder.rs index 240ae0493..717748c1a 100644 --- a/unixfs/src/dir/builder.rs +++ b/unixfs/src/dir/builder.rs @@ -121,22 +121,6 @@ impl fmt::Display for TreeBuildingFailed { impl std::error::Error for TreeBuildingFailed {} -#[derive(Debug)] -enum Visited { - Descent { - node: DirBuilder, - name: Option, - depth: usize, - }, - Post { - parent_id: Option, - id: u64, - name: Option, - depth: usize, - leaves: Vec<(String, Leaf)>, - }, -} - /// Failure cases for 
`PostOrderIterator` creating the tree dag-pb nodes. #[derive(Debug)] pub enum TreeConstructionFailed { diff --git a/unixfs/src/dir/builder/buffered.rs b/unixfs/src/dir/builder/buffered.rs index 095293a62..824f5c769 100644 --- a/unixfs/src/dir/builder/buffered.rs +++ b/unixfs/src/dir/builder/buffered.rs @@ -1,4 +1,4 @@ -use super::{DirBuilder, Entry, Leaf, PostOrderIterator, TreeBuildingFailed, TreeOptions, Visited}; +use super::{DirBuilder, Entry, Leaf, PostOrderIterator, TreeBuildingFailed, TreeOptions}; use crate::Metadata; use cid::Cid; use std::collections::hash_map::Entry::*; @@ -180,14 +180,7 @@ impl BufferingTreeBuilder { /// its data during the walk. `PostOrderIterator` implements `Iterator` while also allowing /// borrowed access via `next_borrowed`. pub fn build(self) -> PostOrderIterator { - PostOrderIterator::new( - Visited::Descent { - node: self.root_builder, - name: None, - depth: 0, - }, - self.opts, - ) + PostOrderIterator::new(self.root_builder, self.opts) } } diff --git a/unixfs/src/dir/builder/iter.rs b/unixfs/src/dir/builder/iter.rs index a42e8ffb0..4b795224d 100644 --- a/unixfs/src/dir/builder/iter.rs +++ b/unixfs/src/dir/builder/iter.rs @@ -1,4 +1,4 @@ -use super::{Entry, Leaf, TreeConstructionFailed, TreeOptions, Visited}; +use super::{DirBuilder, Entry, Leaf, TreeConstructionFailed, TreeOptions}; use cid::Cid; use std::collections::{BTreeMap, HashMap}; use std::fmt; @@ -22,13 +22,33 @@ pub struct PostOrderIterator { opts: TreeOptions, } +#[derive(Debug)] +enum Visited { + Descent { + node: DirBuilder, + name: Option, + depth: usize, + }, + Post { + parent_id: Option, + id: u64, + name: Option, + depth: usize, + leaves: Vec<(String, Leaf)>, + }, +} + impl PostOrderIterator { - pub(super) fn new(root: Visited, opts: TreeOptions) -> Self { + pub(super) fn new(root: DirBuilder, opts: TreeOptions) -> Self { PostOrderIterator { full_path: Default::default(), old_depth: 0, block_buffer: Default::default(), - pending: vec![root], + pending: 
vec![Visited::Descent { + node: root, + name: None, + depth: 0, + }], persisted_cids: Default::default(), reused_children: Vec::new(), cid: None, From eb58b8b15edb1dec8d3019409e165cfd3848d767 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sun, 9 Aug 2020 15:24:11 +0300 Subject: [PATCH 43/57] test: output "actual" on test failure --- unixfs/src/dir/builder/buffered.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/unixfs/src/dir/builder/buffered.rs b/unixfs/src/dir/builder/buffered.rs index 824f5c769..dd63bcd1a 100644 --- a/unixfs/src/dir/builder/buffered.rs +++ b/unixfs/src/dir/builder/buffered.rs @@ -398,7 +398,10 @@ mod tests { } fn verify_results( - mut expected: Vec<(impl AsRef, impl AsRef)>, + mut expected: Vec<( + impl AsRef + std::fmt::Debug, + impl AsRef + std::fmt::Debug, + )>, mut actual: Vec<(String, Cid, Box<[u8]>)>, ) { use std::fmt; @@ -431,7 +434,7 @@ mod tests { ); } - assert_eq!(expected.len(), 0); + assert_eq!(expected.len(), 0, "size mismatch: {:?}", actual); } /// Returns a quick and dirty sha2-256 of the given number as a Cidv0 From 19a62e23dc2e5f2d4b70622bd483c8b3b4deb952 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sun, 9 Aug 2020 15:28:28 +0300 Subject: [PATCH 44/57] refactor: use BTreeMap in dir_builder this allows us to get the unique and sorted links right away, allowing to use Vec> later. 
--- unixfs/src/dir/builder/buffered.rs | 2 +- unixfs/src/dir/builder/dir_builder.rs | 10 +- unixfs/src/dir/builder/iter.rs | 335 ++++++++++++++++++-------- 3 files changed, 244 insertions(+), 103 deletions(-) diff --git a/unixfs/src/dir/builder/buffered.rs b/unixfs/src/dir/builder/buffered.rs index dd63bcd1a..ef594c8a1 100644 --- a/unixfs/src/dir/builder/buffered.rs +++ b/unixfs/src/dir/builder/buffered.rs @@ -1,7 +1,7 @@ use super::{DirBuilder, Entry, Leaf, PostOrderIterator, TreeBuildingFailed, TreeOptions}; use crate::Metadata; use cid::Cid; -use std::collections::hash_map::Entry::*; +use std::collections::btree_map::Entry::*; /// UnixFs directory tree builder which buffers entries until `build()` is called. #[derive(Debug)] diff --git a/unixfs/src/dir/builder/dir_builder.rs b/unixfs/src/dir/builder/dir_builder.rs index 94a341e53..486deecec 100644 --- a/unixfs/src/dir/builder/dir_builder.rs +++ b/unixfs/src/dir/builder/dir_builder.rs @@ -1,7 +1,7 @@ use super::{Entry, Leaf}; use crate::Metadata; -use std::collections::hash_map::Entry::*; -use std::collections::HashMap; +use std::collections::btree_map::Entry::*; +use std::collections::BTreeMap; pub(super) struct DuplicateName; pub(super) struct FoundLeaf; @@ -10,7 +10,7 @@ pub(super) struct FoundLeaf; #[derive(Debug)] pub(super) struct DirBuilder { /// Immediate files, symlinks or directories in this directory - pub nodes: HashMap, + pub nodes: BTreeMap, /// Metadata for this directory metadata: Metadata, /// Id of the parent; None for the root node @@ -23,7 +23,7 @@ impl DirBuilder { pub fn new(parent_id: u64, id: u64) -> Self { assert_ne!(parent_id, id); DirBuilder { - nodes: HashMap::new(), + nodes: Default::default(), metadata: Default::default(), parent_id: Some(parent_id), id, @@ -32,7 +32,7 @@ impl DirBuilder { pub fn root(id: u64) -> Self { DirBuilder { - nodes: HashMap::new(), + nodes: Default::default(), metadata: Default::default(), parent_id: None, id, diff --git a/unixfs/src/dir/builder/iter.rs 
b/unixfs/src/dir/builder/iter.rs index 4b795224d..932690db8 100644 --- a/unixfs/src/dir/builder/iter.rs +++ b/unixfs/src/dir/builder/iter.rs @@ -1,6 +1,6 @@ use super::{DirBuilder, Entry, Leaf, TreeConstructionFailed, TreeOptions}; use cid::Cid; -use std::collections::{BTreeMap, HashMap}; +use std::collections::HashMap; use std::fmt; /// Constructs the directory nodes required for a tree. @@ -13,8 +13,9 @@ pub struct PostOrderIterator { block_buffer: Vec, // our stack of pending work pending: Vec, - // "communication channel" from nested entries back to their parents - persisted_cids: HashMap, BTreeMap>, + // "communication channel" from nested entries back to their parents this hashmap is only used + // in the event of mixed child nodes (leaves and nodes). + persisted_cids: HashMap>>, reused_children: Vec, cid: Option, total_size: u64, @@ -22,33 +23,41 @@ pub struct PostOrderIterator { opts: TreeOptions, } +type Leaves = Vec>; + +#[derive(Debug)] +struct NamedLeaf(String, Cid, u64); + #[derive(Debug)] enum Visited { + // handle root differently not to infect everything with the Option and so on + DescentRoot(DirBuilder), Descent { node: DirBuilder, - name: Option, + name: String, depth: usize, + index: usize, }, Post { - parent_id: Option, - id: u64, - name: Option, + parent_id: u64, depth: usize, - leaves: Vec<(String, Leaf)>, + name: String, + index: usize, + leaves: LeafStorage, + }, + PostRoot { + leaves: LeafStorage, }, } impl PostOrderIterator { pub(super) fn new(root: DirBuilder, opts: TreeOptions) -> Self { + let root = Visited::DescentRoot(root); PostOrderIterator { full_path: Default::default(), old_depth: 0, block_buffer: Default::default(), - pending: vec![Visited::Descent { - node: root, - name: None, - depth: 0, - }], + pending: vec![root], persisted_cids: Default::default(), reused_children: Vec::new(), cid: None, @@ -58,7 +67,7 @@ impl PostOrderIterator { } fn render_directory( - links: &BTreeMap, + links: &[Option], buffer: &mut Vec, 
block_size_limit: &Option, ) -> Result { @@ -88,12 +97,13 @@ impl PostOrderIterator { use cid::Version::*; use quick_protobuf::sizeofs::*; + let hash_len = self.0.hash().as_bytes().len(); + match self.0.version() { - V0 => self.0.hash().as_bytes().len(), + V0 => hash_len, V1 => { let version_len = 1; let codec_len = sizeof_varint(u64::from(self.0.codec())); - let hash_len = self.0.hash().as_bytes().len(); version_len + codec_len + hash_len } } @@ -106,21 +116,22 @@ impl PostOrderIterator { use cid::Version::*; match self.0.version() { - V0 => { - for b in self.0.hash().as_bytes() { - w.write_u8(*b)?; - } - Ok(()) - } + V0 => self + .0 + .hash() + .as_bytes() + .iter() + .try_for_each(|b| w.write_u8(*b)), V1 => { // it is possible that Cidv1 should not be linked to from a unixfs // directory; at least go-ipfs 0.5 `ipfs files` denies making a cbor link w.write_u8(1)?; w.write_varint(u64::from(self.0.codec()))?; - for b in self.0.hash().as_bytes() { - w.write_u8(*b)?; - } - Ok(()) + self.0 + .hash() + .as_bytes() + .iter() + .try_for_each(|b| w.write_u8(*b)) } } } @@ -128,47 +139,53 @@ impl PostOrderIterator { /// Newtype which uses the BTreeMap as Vec. struct BTreeMappedDir<'a> { - links: &'a BTreeMap, + links: &'a [Option], data: UnixFs<'a>, } /// Newtype which represents an entry from BTreeMap as PBLink as far as the /// protobuf representation goes. 
- struct EntryAsPBLink<'a>(&'a String, &'a Leaf); + struct EntryAsPBLink<'a>(&'a NamedLeaf); impl<'a> MessageWrite for EntryAsPBLink<'a> { fn get_size(&self) -> usize { use quick_protobuf::sizeofs::*; // ones are the tags - 1 + sizeof_len(self.0.len()) + 1 + sizeof_len((self.0).0.len()) + 1 - //+ sizeof_len(WriteableCid(&self.1.link).get_size()) - + sizeof_len(self.1.link.to_bytes().len()) + + sizeof_len(WriteableCid(&(self.0).1).get_size()) + //+ sizeof_len(self.1.link.to_bytes().len()) + 1 - + sizeof_varint(self.1.total_size) + + sizeof_varint((self.0).2) } fn write_message( &self, w: &mut Writer, ) -> quick_protobuf::Result<()> { - // w.write_with_tag(10, |w| w.write_message(&WriteableCid(&self.1.link)))?; - w.write_with_tag(10, |w| w.write_bytes(&self.1.link.to_bytes()))?; - w.write_with_tag(18, |w| w.write_string(self.0.as_str()))?; - w.write_with_tag(24, |w| w.write_uint64(self.1.total_size))?; + w.write_with_tag(10, |w| w.write_message(&WriteableCid(&(self.0).1)))?; + //w.write_with_tag(10, |w| w.write_bytes(&self.1.link.to_bytes()))?; + w.write_with_tag(18, |w| w.write_string((self.0).0.as_str()))?; + w.write_with_tag(24, |w| w.write_uint64((self.0).2))?; Ok(()) } } + impl<'a> BTreeMappedDir<'a> { + fn mapped(&self) -> impl Iterator> + '_ { + self.links + .iter() + .map(|triple| triple.as_ref().map(|l| EntryAsPBLink(l)).unwrap()) + } + } + impl<'a> MessageWrite for BTreeMappedDir<'a> { fn get_size(&self) -> usize { use quick_protobuf::sizeofs::*; let links = self - .links - .iter() - .map(|(k, v)| EntryAsPBLink(k, v)) + .mapped() .map(|link| 1 + sizeof_len(link.get_size())) .sum::(); @@ -178,7 +195,7 @@ impl PostOrderIterator { &self, w: &mut Writer, ) -> quick_protobuf::Result<()> { - for l in self.links.iter().map(|(k, v)| EntryAsPBLink(k, v)) { + for l in self.mapped() { w.write_with_tag(18, |w| w.write_message(&l))?; } w.write_with_tag(10, |w| w.write_message(&self.data)) @@ -193,6 +210,25 @@ impl PostOrderIterator { }, }; + /* + + use 
crate::pb::{FlatUnixFs, PBLink}; + use std::borrow::Cow; + let btreed = FlatUnixFs { + links: links + .iter() // .drain() would be the most reasonable + .map(|(name, Leaf { link, total_size })| PBLink { + Hash: Some(link.to_bytes().into()), + Name: Some(Cow::Borrowed(name.as_str())), + Tsize: Some(*total_size), + }) + .collect::>(), + data: UnixFs { + Type: UnixFsType::Directory, + ..Default::default() + }, + };*/ + /**/ let size = btreed.get_size(); if let Some(limit) = block_size_limit { @@ -227,8 +263,12 @@ impl PostOrderIterator { let cid = Cid::new_v0(mh).expect("sha2_256 is the correct multihash for cidv0"); let combined_from_links = links - .values() - .map(|Leaf { total_size, .. }| total_size) + .iter() + .map(|opt| { + opt.as_ref() + .map(|NamedLeaf(_, _, total_size)| total_size) + .unwrap() + }) .sum::(); Ok(Leaf { @@ -243,78 +283,81 @@ impl PostOrderIterator { pub fn next_borrowed(&mut self) -> Option, TreeConstructionFailed>> { while let Some(visited) = self.pending.pop() { let (name, depth) = match &visited { - Visited::Descent { name, depth, .. } => (name.as_deref(), *depth), - Visited::Post { name, depth, .. } => (name.as_deref(), *depth), + Visited::DescentRoot(_) => (None, 0), + Visited::Descent { name, depth, .. } => (Some(name.as_ref()), *depth), + Visited::Post { name, depth, .. } => (Some(name.as_ref()), *depth), + Visited::PostRoot { .. 
} => (None, 0), }; update_full_path((&mut self.full_path, &mut self.old_depth), name, depth); match visited { - Visited::Descent { node, name, depth } => { - let mut leaves = Vec::new(); + Visited::DescentRoot(node) => { + let children = &mut self.reused_children; + let leaves = partition_children_leaves(depth, node.nodes.into_iter(), children); + + // initial idea was to validate something with + + let any_children = !children.is_empty(); + + let leaves = if any_children { + // we only need to put the leaves in there in the case of wrapping + self.persisted_cids.insert(node.id, leaves); + LeafStorage::from(node.id) + } else { + leaves.into() + }; + + self.pending.push(Visited::PostRoot { leaves }); + self.pending.extend(children.drain(..)); + } + Visited::Descent { + node, + name, + depth, + index, + } => { let children = &mut self.reused_children; - for (k, v) in node.nodes { - match v { - Entry::Directory(node) => children.push(Visited::Descent { - node, - name: Some(k), - depth: depth + 1, - }), - Entry::Leaf(leaf) => leaves.push((k, leaf)), - } - } + let leaves = partition_children_leaves(depth, node.nodes.into_iter(), children); + + let any_children = !children.is_empty(); + + // this would be none for only the single first node, however we know already + // this is not the branch DescentRoot + let parent_id = node.parent_id.expect("this is not root"); + + let leaves = if any_children { + self.persisted_cids.insert(node.id, leaves); + node.id.into() + } else { + leaves.into() + }; self.pending.push(Visited::Post { - parent_id: node.parent_id, - id: node.id, + parent_id, name, depth, leaves, + index, }); - let any_children = !children.is_empty(); - self.pending.extend(children.drain(..)); - - if any_children { - // we could strive to do everything right now but pushing and popping might - // turn out easier code wise, or in other words, when there are no child_nodes - // we wouldn't need to go through Visited::Post. 
- } } Visited::Post { parent_id, - id, name, leaves, + index, .. } => { - // all of our children have now been visited; we should be able to find their - // Cids in the btreemap - let mut collected = self.persisted_cids.remove(&Some(id)).unwrap_or_default(); - - // FIXME: leaves could be drained and reused - collected.extend(leaves); - - if !self.opts.wrap_with_directory && parent_id.is_none() { - // we aren't supposed to wrap_with_directory, and we are now looking at the - // possibly to-be-generated root directory. - - assert_eq!( - collected.len(), - 1, - "should not have gone this far with multiple added roots" - ); - - return None; - } + let leaves = leaves.into_inner(&mut self.persisted_cids); let buffer = &mut self.block_buffer; let leaf = match Self::render_directory( - &collected, + &leaves, buffer, &self.opts.block_size_limit, ) { @@ -325,21 +368,53 @@ impl PostOrderIterator { self.cid = Some(leaf.link.clone()); self.total_size = leaf.total_size; - // this reuse strategy is probably good enough - collected.clear(); - - if let Some(name) = name { + { // name is None only for wrap_with_directory, which cannot really be // propagated up but still the parent_id is allowed to be None - let previous = self - .persisted_cids - .entry(parent_id) - .or_insert(collected) - .insert(name, leaf); + let parent_leaves = self.persisted_cids.get_mut(&parent_id); + + match (parent_id, parent_leaves, index) { + (pid, None, index) => panic!( + "leaves not found for parent_id = {} and index = {}", + pid, index + ), + (_, Some(vec), index) => { + let cell = &mut vec[index]; + // all + assert!(cell.is_none()); + *cell = Some(NamedLeaf(name, leaf.link, leaf.total_size)); + } + } + } + + return Some(Ok(TreeNode { + path: self.full_path.as_str(), + cid: self.cid.as_ref().unwrap(), + total_size: self.total_size, + block: &self.block_buffer, + })); + } + Visited::PostRoot { leaves } => { + let leaves = leaves.into_inner(&mut self.persisted_cids); - assert!(previous.is_none()); + if 
!self.opts.wrap_with_directory { + break; } + let buffer = &mut self.block_buffer; + + let leaf = match Self::render_directory( + &leaves, + buffer, + &self.opts.block_size_limit, + ) { + Ok(leaf) => leaf, + Err(e) => return Some(Err(e)), + }; + + self.cid = Some(leaf.link.clone()); + self.total_size = leaf.total_size; + return Some(Ok(TreeNode { path: self.full_path.as_str(), cid: self.cid.as_ref().unwrap(), @@ -425,6 +500,11 @@ fn update_full_path( // this would be easier with PathBuf let slash_at = full_path.bytes().rposition(|ch| ch == b'/'); if let Some(slash_at) = slash_at { + if *old_depth == depth && Some(&full_path[(slash_at + 1)..]) == name { + // minor unmeasurable perf optimization: + // going from a/b/foo/zz => a/b/foo does not need to go through the a/b + return; + } full_path.truncate(slash_at); *old_depth -= 1; } else { @@ -449,4 +529,65 @@ fn update_full_path( } assert_eq!(*old_depth, depth); + // eprintln!("{:>4} {:?}", depth, full_path); +} + +fn partition_children_leaves( + depth: usize, + it: impl Iterator, + children: &mut Vec, +) -> Leaves { + let mut leaves = Vec::new(); + + for (i, (k, v)) in it.enumerate() { + match v { + Entry::Directory(node) => { + children.push(Visited::Descent { + node, + // this needs to be pushed down to update the full_path + name: k, + depth: depth + 1, + index: i, + }); + + // this will be overwritten later, but the order is fixed + leaves.push(None); + } + Entry::Leaf(leaf) => leaves.push(Some(NamedLeaf(k, leaf.link, leaf.total_size))), + } + } + + leaves +} + +#[derive(Debug)] +enum LeafStorage { + Direct(Leaves), + Stashed(u64), +} + +impl LeafStorage { + fn into_inner(self, stash: &mut HashMap) -> Leaves { + use LeafStorage::*; + + match self { + Direct(leaves) => leaves, + Stashed(id) => stash + .remove(&id) + .ok_or(id) + .expect("could not find stashed leaves"), + } + } +} + +impl From for LeafStorage { + fn from(key: u64) -> LeafStorage { + LeafStorage::Stashed(key) + } +} + +impl From for LeafStorage { + 
fn from(leaves: Leaves) -> LeafStorage { + LeafStorage::Direct(leaves) + } } From 56f1970d7b70859ae61455c43ecd8185c41ab1d5 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sun, 9 Aug 2020 19:54:33 +0300 Subject: [PATCH 45/57] fix: use knowledge of longest path this saves a few resizes but no effect in bench. --- unixfs/src/dir/builder/buffered.rs | 2 +- unixfs/src/dir/builder/iter.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/unixfs/src/dir/builder/buffered.rs b/unixfs/src/dir/builder/buffered.rs index ef594c8a1..ff9c9434c 100644 --- a/unixfs/src/dir/builder/buffered.rs +++ b/unixfs/src/dir/builder/buffered.rs @@ -180,7 +180,7 @@ impl BufferingTreeBuilder { /// its data during the walk. `PostOrderIterator` implements `Iterator` while also allowing /// borrowed access via `next_borrowed`. pub fn build(self) -> PostOrderIterator { - PostOrderIterator::new(self.root_builder, self.opts) + PostOrderIterator::new(self.root_builder, self.opts, self.longest_path) } } diff --git a/unixfs/src/dir/builder/iter.rs b/unixfs/src/dir/builder/iter.rs index 932690db8..04f81e68e 100644 --- a/unixfs/src/dir/builder/iter.rs +++ b/unixfs/src/dir/builder/iter.rs @@ -51,10 +51,10 @@ enum Visited { } impl PostOrderIterator { - pub(super) fn new(root: DirBuilder, opts: TreeOptions) -> Self { + pub(super) fn new(root: DirBuilder, opts: TreeOptions, longest_path: usize) -> Self { let root = Visited::DescentRoot(root); PostOrderIterator { - full_path: Default::default(), + full_path: String::with_capacity(longest_path), old_depth: 0, block_buffer: Default::default(), pending: vec![root], From 3876717e5d6724bb80ccf92cf4094c52d9401dfb Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sun, 9 Aug 2020 20:18:45 +0300 Subject: [PATCH 46/57] doc: cleanup, rename, docs --- unixfs/src/dir/builder/iter.rs | 97 +++++++++++++++------------------- 1 file changed, 42 insertions(+), 55 deletions(-) diff --git a/unixfs/src/dir/builder/iter.rs 
b/unixfs/src/dir/builder/iter.rs index 04f81e68e..0d4bda474 100644 --- a/unixfs/src/dir/builder/iter.rs +++ b/unixfs/src/dir/builder/iter.rs @@ -23,6 +23,10 @@ pub struct PostOrderIterator { opts: TreeOptions, } +/// The link list used to create the directory node. This list is created from a the BTreeMap +/// inside DirBuilder, and initially it will have `Some` values only for the initial leaves and +/// `None` values for subnodes which are not yet ready. At the time of use, this list is expected +/// to have only `Some` values. type Leaves = Vec>; #[derive(Debug)] @@ -30,7 +34,7 @@ struct NamedLeaf(String, Cid, u64); #[derive(Debug)] enum Visited { - // handle root differently not to infect everything with the Option and so on + // handle root differently not to infect with the Option and Option DescentRoot(DirBuilder), Descent { node: DirBuilder, @@ -116,38 +120,30 @@ impl PostOrderIterator { use cid::Version::*; match self.0.version() { - V0 => self - .0 - .hash() - .as_bytes() - .iter() - .try_for_each(|b| w.write_u8(*b)), + V0 => { /* cidv0 has only the _multi_hash */ } V1 => { - // it is possible that Cidv1 should not be linked to from a unixfs + // it is possible that CidV1 should not be linked to from a unixfs // directory; at least go-ipfs 0.5 `ipfs files` denies making a cbor link + // but happily accepts and does refs over one. w.write_u8(1)?; w.write_varint(u64::from(self.0.codec()))?; - self.0 - .hash() - .as_bytes() - .iter() - .try_for_each(|b| w.write_u8(*b)) } } - } - } - /// Newtype which uses the BTreeMap as Vec. - struct BTreeMappedDir<'a> { - links: &'a [Option], - data: UnixFs<'a>, + self.0 + .hash() + .as_bytes() + .iter() + // while this looks bad it cannot be measured; note we cannot use the + // write_bytes because that is length prefixed bytes write + .try_for_each(|b| w.write_u8(*b)) + } } - /// Newtype which represents an entry from BTreeMap as PBLink as far as the - /// protobuf representation goes. 
- struct EntryAsPBLink<'a>(&'a NamedLeaf); + /// Custom NamedLeaf as PBLink "adapter." + struct NamedLeafAsPBLink<'a>(&'a NamedLeaf); - impl<'a> MessageWrite for EntryAsPBLink<'a> { + impl<'a> MessageWrite for NamedLeafAsPBLink<'a> { fn get_size(&self) -> usize { use quick_protobuf::sizeofs::*; @@ -172,15 +168,21 @@ impl PostOrderIterator { } } - impl<'a> BTreeMappedDir<'a> { - fn mapped(&self) -> impl Iterator> + '_ { + /// Newtype which uses the &[Option<(NamedLeaf)>] as Vec. + struct CustomFlatUnixFs<'a> { + links: &'a [Option], + data: UnixFs<'a>, + } + + impl<'a> CustomFlatUnixFs<'a> { + fn mapped(&self) -> impl Iterator> + '_ { self.links .iter() - .map(|triple| triple.as_ref().map(|l| EntryAsPBLink(l)).unwrap()) + .map(|triple| triple.as_ref().map(|l| NamedLeafAsPBLink(l)).unwrap()) } } - impl<'a> MessageWrite for BTreeMappedDir<'a> { + impl<'a> MessageWrite for CustomFlatUnixFs<'a> { fn get_size(&self) -> usize { use quick_protobuf::sizeofs::*; @@ -191,6 +193,7 @@ impl PostOrderIterator { links + 1 + sizeof_len(self.data.get_size()) } + fn write_message( &self, w: &mut Writer, @@ -202,7 +205,7 @@ impl PostOrderIterator { } } - let btreed = BTreeMappedDir { + let node = CustomFlatUnixFs { links, data: UnixFs { Type: UnixFsType::Directory, @@ -210,51 +213,35 @@ impl PostOrderIterator { }, }; - /* - - use crate::pb::{FlatUnixFs, PBLink}; - use std::borrow::Cow; - let btreed = FlatUnixFs { - links: links - .iter() // .drain() would be the most reasonable - .map(|(name, Leaf { link, total_size })| PBLink { - Hash: Some(link.to_bytes().into()), - Name: Some(Cow::Borrowed(name.as_str())), - Tsize: Some(*total_size), - }) - .collect::>(), - data: UnixFs { - Type: UnixFsType::Directory, - ..Default::default() - }, - };*/ - /**/ - let size = btreed.get_size(); + let size = node.get_size(); if let Some(limit) = block_size_limit { let size = size as u64; if *limit < size { - // FIXME: this could probably be detected at + // FIXME: this could probably be detected at 
builder return Err(TreeConstructionFailed::TooLargeBlock(size)); } } - // FIXME: we shouldn't be creating too large structures (bitswap block size limit!) - // FIXME: changing this to autosharding is going to take some thinking - let cap = buffer.capacity(); if let Some(additional) = size.checked_sub(cap) { buffer.reserve(additional); } - if let Some(needed_zeroes) = size.checked_sub(buffer.len()) { + if let Some(mut needed_zeroes) = size.checked_sub(buffer.len()) { + let zeroes = [0; 8]; + + while needed_zeroes > 8 { + buffer.extend_from_slice(&zeroes[..]); + needed_zeroes -= zeroes.len(); + } + buffer.extend(std::iter::repeat(0).take(needed_zeroes)); } let mut writer = Writer::new(BytesWriter::new(&mut buffer[..])); - btreed - .write_message(&mut writer) + node.write_message(&mut writer) .map_err(TreeConstructionFailed::Protobuf)?; buffer.truncate(size); From 3d48caa07945060a49e423fb3dfe66f8e46df139 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sun, 9 Aug 2020 20:28:11 +0300 Subject: [PATCH 47/57] refactor: custom pb types to own module --- unixfs/src/dir/builder.rs | 6 ++ unixfs/src/dir/builder/custom_pb.rs | 108 +++++++++++++++++++++++++ unixfs/src/dir/builder/iter.rs | 121 +--------------------------- 3 files changed, 118 insertions(+), 117 deletions(-) create mode 100644 unixfs/src/dir/builder/custom_pb.rs diff --git a/unixfs/src/dir/builder.rs b/unixfs/src/dir/builder.rs index 717748c1a..16004c135 100644 --- a/unixfs/src/dir/builder.rs +++ b/unixfs/src/dir/builder.rs @@ -10,6 +10,9 @@ pub use iter::{OwnedTreeNode, PostOrderIterator, TreeNode}; mod buffered; pub use buffered::BufferingTreeBuilder; +mod custom_pb; +use custom_pb::CustomFlatUnixFs; + enum Entry { Leaf(Leaf), Directory(DirBuilder), @@ -143,3 +146,6 @@ impl fmt::Display for TreeConstructionFailed { } impl std::error::Error for TreeConstructionFailed {} + +#[derive(Debug)] +struct NamedLeaf(String, Cid, u64); diff --git a/unixfs/src/dir/builder/custom_pb.rs 
b/unixfs/src/dir/builder/custom_pb.rs new file mode 100644 index 000000000..ee0a0ad95 --- /dev/null +++ b/unixfs/src/dir/builder/custom_pb.rs @@ -0,0 +1,108 @@ +//! Custom protobuf types which are used in encoding directorys. + +use super::NamedLeaf; +use crate::pb::UnixFs; +use cid::Cid; +use quick_protobuf::{MessageWrite, Writer, WriterBackend}; + +/// Newtype which uses the &[Option<(NamedLeaf)>] as Vec. +pub(super) struct CustomFlatUnixFs<'a> { + pub(super) links: &'a [Option], + pub(super) data: UnixFs<'a>, +} + +impl<'a> CustomFlatUnixFs<'a> { + fn mapped(&self) -> impl Iterator> + '_ { + self.links + .iter() + .map(|triple| triple.as_ref().map(|l| NamedLeafAsPBLink(l)).unwrap()) + } +} + +impl<'a> MessageWrite for CustomFlatUnixFs<'a> { + fn get_size(&self) -> usize { + use quick_protobuf::sizeofs::*; + + let links = self + .mapped() + .map(|link| 1 + sizeof_len(link.get_size())) + .sum::(); + + links + 1 + sizeof_len(self.data.get_size()) + } + + fn write_message(&self, w: &mut Writer) -> quick_protobuf::Result<()> { + self.mapped() + .try_for_each(|l| w.write_with_tag(18, |w| w.write_message(&l)))?; + w.write_with_tag(10, |w| w.write_message(&self.data)) + } +} + +/// Custom NamedLeaf as PBLink "adapter." 
+struct NamedLeafAsPBLink<'a>(&'a NamedLeaf); + +impl<'a> MessageWrite for NamedLeafAsPBLink<'a> { + fn get_size(&self) -> usize { + use quick_protobuf::sizeofs::*; + + // ones are the tags + 1 + sizeof_len((self.0).0.len()) + + 1 + + sizeof_len(WriteableCid(&(self.0).1).get_size()) + //+ sizeof_len(self.1.link.to_bytes().len()) + + 1 + + sizeof_varint((self.0).2) + } + + fn write_message(&self, w: &mut Writer) -> quick_protobuf::Result<()> { + w.write_with_tag(10, |w| w.write_message(&WriteableCid(&(self.0).1)))?; + //w.write_with_tag(10, |w| w.write_bytes(&self.1.link.to_bytes()))?; + w.write_with_tag(18, |w| w.write_string((self.0).0.as_str()))?; + w.write_with_tag(24, |w| w.write_uint64((self.0).2))?; + Ok(()) + } +} + +/// Newtype around Cid to allow embedding it as PBLink::Hash without allocating a vector. +struct WriteableCid<'a>(&'a Cid); + +impl<'a> MessageWrite for WriteableCid<'a> { + fn get_size(&self) -> usize { + use cid::Version::*; + use quick_protobuf::sizeofs::*; + + let hash_len = self.0.hash().as_bytes().len(); + + match self.0.version() { + V0 => hash_len, + V1 => { + let version_len = 1; + let codec_len = sizeof_varint(u64::from(self.0.codec())); + version_len + codec_len + hash_len + } + } + } + + fn write_message(&self, w: &mut Writer) -> quick_protobuf::Result<()> { + use cid::Version::*; + + match self.0.version() { + V0 => { /* cidv0 has only the _multi_hash */ } + V1 => { + // it is possible that CidV1 should not be linked to from a unixfs + // directory; at least go-ipfs 0.5 `ipfs files` denies making a cbor link + // but happily accepts and does refs over one. 
+ w.write_u8(1)?; + w.write_varint(u64::from(self.0.codec()))?; + } + } + + self.0 + .hash() + .as_bytes() + .iter() + // while this looks bad it cannot be measured; note we cannot use the + // write_bytes because that is length prefixed bytes write + .try_for_each(|b| w.write_u8(*b)) + } +} diff --git a/unixfs/src/dir/builder/iter.rs b/unixfs/src/dir/builder/iter.rs index 0d4bda474..f7660bef6 100644 --- a/unixfs/src/dir/builder/iter.rs +++ b/unixfs/src/dir/builder/iter.rs @@ -1,4 +1,6 @@ -use super::{DirBuilder, Entry, Leaf, TreeConstructionFailed, TreeOptions}; +use super::{ + CustomFlatUnixFs, DirBuilder, Entry, Leaf, NamedLeaf, TreeConstructionFailed, TreeOptions, +}; use cid::Cid; use std::collections::HashMap; use std::fmt; @@ -29,9 +31,6 @@ pub struct PostOrderIterator { /// to have only `Some` values. type Leaves = Vec>; -#[derive(Debug)] -struct NamedLeaf(String, Cid, u64); - #[derive(Debug)] enum Visited { // handle root differently not to infect with the Option and Option @@ -76,7 +75,7 @@ impl PostOrderIterator { block_size_limit: &Option, ) -> Result { use crate::pb::{UnixFs, UnixFsType}; - use quick_protobuf::{BytesWriter, MessageWrite, Writer, WriterBackend}; + use quick_protobuf::{BytesWriter, MessageWrite, Writer}; use sha2::{Digest, Sha256}; // FIXME: ideas on how to turn this into a HAMT sharding on some heuristic. we probably @@ -93,118 +92,6 @@ impl PostOrderIterator { // heuristic can be detected *at* bufferedtreewriter. there the split would be easier, and // this would "just" be a single node rendering, and not need any additional states.. - /// Newtype around Cid to allow embedding it as PBLink::Hash without allocating a vector. 
- struct WriteableCid<'a>(&'a Cid); - - impl<'a> MessageWrite for WriteableCid<'a> { - fn get_size(&self) -> usize { - use cid::Version::*; - use quick_protobuf::sizeofs::*; - - let hash_len = self.0.hash().as_bytes().len(); - - match self.0.version() { - V0 => hash_len, - V1 => { - let version_len = 1; - let codec_len = sizeof_varint(u64::from(self.0.codec())); - version_len + codec_len + hash_len - } - } - } - - fn write_message( - &self, - w: &mut Writer, - ) -> quick_protobuf::Result<()> { - use cid::Version::*; - - match self.0.version() { - V0 => { /* cidv0 has only the _multi_hash */ } - V1 => { - // it is possible that CidV1 should not be linked to from a unixfs - // directory; at least go-ipfs 0.5 `ipfs files` denies making a cbor link - // but happily accepts and does refs over one. - w.write_u8(1)?; - w.write_varint(u64::from(self.0.codec()))?; - } - } - - self.0 - .hash() - .as_bytes() - .iter() - // while this looks bad it cannot be measured; note we cannot use the - // write_bytes because that is length prefixed bytes write - .try_for_each(|b| w.write_u8(*b)) - } - } - - /// Custom NamedLeaf as PBLink "adapter." - struct NamedLeafAsPBLink<'a>(&'a NamedLeaf); - - impl<'a> MessageWrite for NamedLeafAsPBLink<'a> { - fn get_size(&self) -> usize { - use quick_protobuf::sizeofs::*; - - // ones are the tags - 1 + sizeof_len((self.0).0.len()) - + 1 - + sizeof_len(WriteableCid(&(self.0).1).get_size()) - //+ sizeof_len(self.1.link.to_bytes().len()) - + 1 - + sizeof_varint((self.0).2) - } - - fn write_message( - &self, - w: &mut Writer, - ) -> quick_protobuf::Result<()> { - w.write_with_tag(10, |w| w.write_message(&WriteableCid(&(self.0).1)))?; - //w.write_with_tag(10, |w| w.write_bytes(&self.1.link.to_bytes()))?; - w.write_with_tag(18, |w| w.write_string((self.0).0.as_str()))?; - w.write_with_tag(24, |w| w.write_uint64((self.0).2))?; - Ok(()) - } - } - - /// Newtype which uses the &[Option<(NamedLeaf)>] as Vec. 
- struct CustomFlatUnixFs<'a> { - links: &'a [Option], - data: UnixFs<'a>, - } - - impl<'a> CustomFlatUnixFs<'a> { - fn mapped(&self) -> impl Iterator> + '_ { - self.links - .iter() - .map(|triple| triple.as_ref().map(|l| NamedLeafAsPBLink(l)).unwrap()) - } - } - - impl<'a> MessageWrite for CustomFlatUnixFs<'a> { - fn get_size(&self) -> usize { - use quick_protobuf::sizeofs::*; - - let links = self - .mapped() - .map(|link| 1 + sizeof_len(link.get_size())) - .sum::(); - - links + 1 + sizeof_len(self.data.get_size()) - } - - fn write_message( - &self, - w: &mut Writer, - ) -> quick_protobuf::Result<()> { - for l in self.mapped() { - w.write_with_tag(18, |w| w.write_message(&l))?; - } - w.write_with_tag(10, |w| w.write_message(&self.data)) - } - } - let node = CustomFlatUnixFs { links, data: UnixFs { From 28392da97908368cc29c2a917ac35a9218fa04a5 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sun, 9 Aug 2020 20:31:11 +0300 Subject: [PATCH 48/57] chore: cleanup, doc --- unixfs/src/dir/builder/iter.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unixfs/src/dir/builder/iter.rs b/unixfs/src/dir/builder/iter.rs index f7660bef6..e3017eda0 100644 --- a/unixfs/src/dir/builder/iter.rs +++ b/unixfs/src/dir/builder/iter.rs @@ -403,9 +403,10 @@ fn update_full_path( } assert_eq!(*old_depth, depth); - // eprintln!("{:>4} {:?}", depth, full_path); } +/// Returns a Vec of the links in order with only the leaves, the given `children` will contain yet +/// incomplete nodes of the tree. 
fn partition_children_leaves( depth: usize, it: impl Iterator, From 5484f3ed8fd402406773f8c653979767cf07f4e3 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sun, 9 Aug 2020 20:43:28 +0300 Subject: [PATCH 49/57] fix: make the benchmark more generic --- unixfs/Cargo.toml | 2 +- .../{ingest-linux-tar.rs => ingest-tar.rs} | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) rename unixfs/benches/{ingest-linux-tar.rs => ingest-tar.rs} (87%) diff --git a/unixfs/Cargo.toml b/unixfs/Cargo.toml index cc49a57d6..465490b66 100644 --- a/unixfs/Cargo.toml +++ b/unixfs/Cargo.toml @@ -27,5 +27,5 @@ tar = { default-features = false, version = "0.4" } criterion = "0.3" [[bench]] -name = "ingest-linux-tar" +name = "ingest-tar" harness = false diff --git a/unixfs/benches/ingest-linux-tar.rs b/unixfs/benches/ingest-tar.rs similarity index 87% rename from unixfs/benches/ingest-linux-tar.rs rename to unixfs/benches/ingest-tar.rs index c55bb12de..e632d3a66 100644 --- a/unixfs/benches/ingest-linux-tar.rs +++ b/unixfs/benches/ingest-tar.rs @@ -1,8 +1,19 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion}; pub fn criterion_benchmark(c: &mut Criterion) { - let tar_bytes = std::fs::read("linux-5.6.tar").expect("read failed"); - c.bench_function("ingest-linux-tar", |b| b.iter(|| ingest_tar(&tar_bytes))); + let file = "benchmark.tar"; + + match std::fs::read(file) { + Ok(tar_bytes) => { + c.bench_function("ingest-tar", |b| b.iter(|| ingest_tar(&tar_bytes))); + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + eprintln!("could not find {:?}:", file); + eprintln!("please download a linux kernel and unpack it to enable benchmark. 
specific version doesn't matter."); + return; + } + Err(e) => panic!("failed to read the {:?}: {}", file, e), + } } fn ingest_tar(bytes: &[u8]) { @@ -42,7 +53,7 @@ fn ingest_tar(bytes: &[u8]) { } if let Some(mut needed) = adder.size_hint().checked_sub(buffer.len()) { - let zeros = [0u8; 64]; + let zeros = [0u8; 8]; while needed > zeros.len() { buffer.extend_from_slice(&zeros[..]); From 71b02cf3707c96269d6bd62980272c22184c3e59 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sun, 9 Aug 2020 20:43:43 +0300 Subject: [PATCH 50/57] chore: cargo keeps changing the index file, clippy not sure what is the index file hash change. --- Cargo.lock | 2 ++ unixfs/benches/ingest-tar.rs | 20 +++++--------------- unixfs/examples/ingest-tar.rs | 4 ++-- 3 files changed, 9 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5cc832a71..cfab037a9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2310,6 +2310,8 @@ checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" [[package]] name = "quick-protobuf" version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e489d4a83c17ea69b0291630229b5d4c92a94a3bf0165f7f72f506e94cda8b4b" dependencies = [ "byteorder 1.3.4", ] diff --git a/unixfs/benches/ingest-tar.rs b/unixfs/benches/ingest-tar.rs index e632d3a66..d8862f197 100644 --- a/unixfs/benches/ingest-tar.rs +++ b/unixfs/benches/ingest-tar.rs @@ -10,7 +10,6 @@ pub fn criterion_benchmark(c: &mut Criterion) { Err(e) if e.kind() == std::io::ErrorKind::NotFound => { eprintln!("could not find {:?}:", file); eprintln!("please download a linux kernel and unpack it to enable benchmark. 
specific version doesn't matter."); - return; } Err(e) => panic!("failed to read the {:?}: {}", file, e), } @@ -25,14 +24,15 @@ fn ingest_tar(bytes: &[u8]) { let mut buffer = Vec::new(); let mut archive = tar::Archive::new(std::io::Cursor::new(bytes)); - let mut entries = archive.entries().unwrap(); + let entries = archive.entries().unwrap(); let mut opts = TreeOptions::default(); opts.wrap_with_directory(); let mut tree = BufferingTreeBuilder::new(opts); - while let Some(entry) = entries.next() { - let mut entry = entry.unwrap(); + for entry in entries { + let mut entry = entry.except("assuming good tar"); + let path = std::str::from_utf8(&*entry.path_bytes()) .unwrap() .to_string(); // need to get rid of this @@ -107,17 +107,7 @@ fn ingest_tar(bytes: &[u8]) { while let Some(res) = iter.next_borrowed() { let res = res.unwrap(); - - match &mut last { - Some(ref mut s) => { - s.0 = res.cid.to_owned(); - s.1 = res.total_size; - s.2 = res.block.len(); - } - n @ None => { - *n = Some((res.cid.to_owned(), res.total_size, res.block.len())); - } - } + last = Some((res.cid.to_owned(), res.total_size, res.block.len())); } let last = last.unwrap(); diff --git a/unixfs/examples/ingest-tar.rs b/unixfs/examples/ingest-tar.rs index 14f2e08ca..ad49ad837 100644 --- a/unixfs/examples/ingest-tar.rs +++ b/unixfs/examples/ingest-tar.rs @@ -12,7 +12,7 @@ fn main() { let stdin = stdin.lock(); let mut archive = tar::Archive::new(stdin); - let mut entries = archive.entries().unwrap(); + let entries = archive.entries().unwrap(); let mut buffer = Vec::new(); @@ -20,7 +20,7 @@ fn main() { opts.wrap_with_directory(); let mut tree = BufferingTreeBuilder::new(opts); - while let Some(entry) = entries.next() { + for entry in entries { let mut entry = entry.unwrap(); let path = std::str::from_utf8(&*entry.path_bytes()) .unwrap() From 7446810982375f5494b872786e4a51a5b2d7e294 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sun, 9 Aug 2020 22:13:29 +0300 Subject: [PATCH 51/57] doc: minor docs 
and fmt --- unixfs/benches/ingest-tar.rs | 2 +- unixfs/src/dir/builder/iter.rs | 23 ++++++++++------------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/unixfs/benches/ingest-tar.rs b/unixfs/benches/ingest-tar.rs index d8862f197..149e31ba6 100644 --- a/unixfs/benches/ingest-tar.rs +++ b/unixfs/benches/ingest-tar.rs @@ -31,7 +31,7 @@ fn ingest_tar(bytes: &[u8]) { let mut tree = BufferingTreeBuilder::new(opts); for entry in entries { - let mut entry = entry.except("assuming good tar"); + let mut entry = entry.expect("assuming good tar"); let path = std::str::from_utf8(&*entry.path_bytes()) .unwrap() diff --git a/unixfs/src/dir/builder/iter.rs b/unixfs/src/dir/builder/iter.rs index e3017eda0..7d3d314f7 100644 --- a/unixfs/src/dir/builder/iter.rs +++ b/unixfs/src/dir/builder/iter.rs @@ -31,6 +31,11 @@ pub struct PostOrderIterator { /// to have only `Some` values. type Leaves = Vec>; +/// The nodes in the visit. We need to do a post-order visit, which starts from a single +/// `DescentRoot`, followed by N `Descents` where N is the deepest directory in the tree. On each +/// descent, we'll need to first schedule a `Post` (or `PostRoot`) followed the immediate children +/// of the node. Directories are rendered when all of their direct and indirect descendants have +/// been serialized into NamedLeafs. #[derive(Debug)] enum Visited { // handle root differently not to infect with the Option and Option @@ -39,6 +44,7 @@ enum Visited { node: DirBuilder, name: String, depth: usize, + /// The index in the parents `Leaves` accessible through `PostOrderIterator::persisted_cids`. index: usize, }, Post { @@ -46,6 +52,8 @@ enum Visited { depth: usize, name: String, index: usize, + /// Leaves will be stored directly in this field when there are no DirBuilder descendants, + /// in the `PostOrderIterator::persisted_cids` otherwise. 
leaves: LeafStorage, }, PostRoot { @@ -168,15 +176,10 @@ impl PostOrderIterator { match visited { Visited::DescentRoot(node) => { let children = &mut self.reused_children; - let leaves = partition_children_leaves(depth, node.nodes.into_iter(), children); - - // initial idea was to validate something with - let any_children = !children.is_empty(); let leaves = if any_children { - // we only need to put the leaves in there in the case of wrapping self.persisted_cids.insert(node.id, leaves); LeafStorage::from(node.id) } else { @@ -193,14 +196,9 @@ impl PostOrderIterator { index, } => { let children = &mut self.reused_children; - let leaves = partition_children_leaves(depth, node.nodes.into_iter(), children); - let any_children = !children.is_empty(); - - // this would be none for only the single first node, however we know already - // this is not the branch DescentRoot - let parent_id = node.parent_id.expect("this is not root"); + let parent_id = node.parent_id.expect("only roots parent_id is None"); let leaves = if any_children { self.persisted_cids.insert(node.id, leaves); @@ -227,7 +225,6 @@ impl PostOrderIterator { .. 
} => { let leaves = leaves.into_inner(&mut self.persisted_cids); - let buffer = &mut self.block_buffer; let leaf = match Self::render_directory( @@ -450,7 +447,7 @@ impl LeafStorage { Stashed(id) => stash .remove(&id) .ok_or(id) - .expect("could not find stashed leaves"), + .expect("leaves are either stashed or direct, must able to find with id"), } } } From 9df95ad1e86dee98d55f34c7ad43526928bdb790 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 10 Aug 2020 09:13:44 +0300 Subject: [PATCH 52/57] refactor: remove duplicated ingest-tar example, it remains as bench --- unixfs/examples/ingest-tar.rs | 177 ---------------------------------- 1 file changed, 177 deletions(-) delete mode 100644 unixfs/examples/ingest-tar.rs diff --git a/unixfs/examples/ingest-tar.rs b/unixfs/examples/ingest-tar.rs deleted file mode 100644 index ad49ad837..000000000 --- a/unixfs/examples/ingest-tar.rs +++ /dev/null @@ -1,177 +0,0 @@ -use cid::Cid; -use ipfs_unixfs::dir::builder::{BufferingTreeBuilder, TreeOptions}; -use ipfs_unixfs::file::adder::FileAdder; -use std::fmt; -use std::io::Read; -use std::time::{Duration, Instant}; - -fn main() { - let started = Instant::now(); - - let stdin = std::io::stdin(); - let stdin = stdin.lock(); - - let mut archive = tar::Archive::new(stdin); - let entries = archive.entries().unwrap(); - - let mut buffer = Vec::new(); - - let mut opts = TreeOptions::default(); - opts.wrap_with_directory(); - let mut tree = BufferingTreeBuilder::new(opts); - - for entry in entries { - let mut entry = entry.unwrap(); - let path = std::str::from_utf8(&*entry.path_bytes()) - .unwrap() - .to_string(); // need to get rid of this - - if let Some(_link_name) = entry.link_name_bytes() { - continue; - } - - if !path.ends_with('/') { - let mut adder = FileAdder::default(); - - // with the std::io::Read it'd be good to read into the fileadder, or read into ... - // something. trying to acccess the buffer from in side FileAdder does not seem the be the - // way to go. 
- - if let Some(needed) = adder.size_hint().checked_sub(buffer.capacity()) { - buffer.reserve(needed); - } - - if let Some(mut needed) = adder.size_hint().checked_sub(buffer.len()) { - let zeros = [0u8; 64]; - - while needed > zeros.len() { - buffer.extend_from_slice(&zeros[..]); - needed -= zeros.len(); - } - - buffer.extend(std::iter::repeat(0).take(needed)); - } - - let mut total_written = 0usize; - - loop { - match entry.read(&mut buffer[0..]).unwrap() { - 0 => { - let blocks = adder.finish(); - let (cid, subtotal) = blocks - .fold( - None, - |acc: Option<(Cid, usize)>, (cid, bytes): (Cid, Vec)| match acc - { - Some((_, total)) => Some((cid, total + bytes.len())), - None => Some((cid, bytes.len())), - }, - ) - .expect("this is probably always present"); - - total_written += subtotal; - - tree.put_file(&path, cid, total_written as u64).unwrap(); - break; - } - n => { - let mut read = 0; - while read < n { - let (blocks, consumed) = adder.push(&buffer[read..n]); - read += consumed; - total_written += blocks.map(|(_, bytes)| bytes.len()).sum::(); - } - } - } - } - } else { - tree.set_metadata(&path[..path.len() - 1], ipfs_unixfs::Metadata::default()) - .unwrap(); - } - } - - let mut iter = tree.build(); - - let mut last: Option<(Cid, u64, usize)> = None; - - while let Some(res) = iter.next_borrowed() { - let res = res.unwrap(); - - match &mut last { - Some(ref mut s) => { - s.0 = res.cid.to_owned(); - s.1 = res.total_size; - s.2 = res.block.len(); - } - n @ None => { - *n = Some((res.cid.to_owned(), res.total_size, res.block.len())); - } - } - } - - let last = last.unwrap(); - - println!("{} ({} bytes), total: {} bytes", last.0, last.2, last.1); - - let process_stats = get_process_stats(started); - - match process_stats { - Ok(all) => eprintln!("{}", all), - Err(wall) => eprintln!("wall_time: {:?}", wall), - } -} - -struct ProcessStats { - user_time: Duration, - system_time: Duration, - max_rss: i64, - wall_time: Duration, -} - -impl fmt::Display for ProcessStats 
{ - fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - fmt, - "Max RSS: {} KB, utime: {:?}, stime: {:?}, total: {:?}, wall_time: {:?}", - self.max_rss, - self.user_time, - self.system_time, - self.user_time + self.system_time, - self.wall_time, - ) - } -} - -#[cfg(unix)] -fn get_process_stats(started_at: Instant) -> Result { - fn to_duration(tv: libc::timeval) -> Duration { - assert!(tv.tv_sec >= 0); - Duration::new(tv.tv_sec as u64, tv.tv_usec as u32) - } - - let (max_rss, user_time, system_time) = unsafe { - let mut rusage: libc::rusage = std::mem::zeroed(); - - let retval = libc::getrusage(libc::RUSAGE_SELF, &mut rusage as *mut _); - - assert_eq!(retval, 0); - - (rusage.ru_maxrss, rusage.ru_utime, rusage.ru_stime) - }; - - let user_time = to_duration(user_time); - let system_time = to_duration(system_time); - let wall_time = started_at.elapsed(); - - Ok(ProcessStats { - user_time, - system_time, - max_rss, - wall_time, - }) -} - -#[cfg(not(unix))] -fn get_process_stats(started_at: Instant) -> Result { - Err(started_at.elapsed()) -} From bfc1bf167caed8557811bf4af3831983111d7ed8 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 10 Aug 2020 09:22:58 +0300 Subject: [PATCH 53/57] refactor: rename put_file => put_link --- http/src/v0/root_files/add.rs | 2 +- unixfs/benches/ingest-tar.rs | 2 +- unixfs/src/dir/builder/buffered.rs | 47 +++++++++++++++--------------- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/http/src/v0/root_files/add.rs b/http/src/v0/root_files/add.rs index bb8702458..e5d392f99 100644 --- a/http/src/v0/root_files/add.rs +++ b/http/src/v0/root_files/add.rs @@ -212,7 +212,7 @@ where // using the filename as the path since we can tolerate a single empty named file // however the second one will cause issues - tree.put_file(&filename, root.clone(), total_written) + tree.put_link(&filename, root.clone(), total_written) .map_err(AddError::TreeGathering)?; let filename: Cow<'_, str> = if filename.is_empty() { 
diff --git a/unixfs/benches/ingest-tar.rs b/unixfs/benches/ingest-tar.rs index 149e31ba6..a91e8d193 100644 --- a/unixfs/benches/ingest-tar.rs +++ b/unixfs/benches/ingest-tar.rs @@ -82,7 +82,7 @@ fn ingest_tar(bytes: &[u8]) { total_written += subtotal; - tree.put_file(&path, cid, total_written as u64).unwrap(); + tree.put_link(&path, cid, total_written as u64).unwrap(); break; } n => { diff --git a/unixfs/src/dir/builder/buffered.rs b/unixfs/src/dir/builder/buffered.rs index ff9c9434c..995fd9607 100644 --- a/unixfs/src/dir/builder/buffered.rs +++ b/unixfs/src/dir/builder/buffered.rs @@ -33,16 +33,15 @@ impl BufferingTreeBuilder { } } - /// Registers the given path to be a link to the cid that follows. - /// - /// FIXME: this should be renamed as "put_leaf" or "put_opaque_leaf". - pub fn put_file( + /// Registers the given path to be a link to the cid that follows. The target leaf should be + /// either a file, directory or symlink but could of course be anything. It will be treated as + /// an opaque link. 
+ pub fn put_link( &mut self, full_path: &str, target: Cid, total_size: u64, ) -> Result<(), TreeBuildingFailed> { - // inserted at the depth let leaf = Leaf { link: target, total_size, @@ -201,13 +200,13 @@ mod tests { Cid::try_from("QmRJHYTNvC3hmd9gJQARxLR1QMEincccBV53bBw524yyq6").unwrap(); builder - .put_file("a/b/c/d/e/f/g.txt", five_block_foobar.clone(), 221) + .put_link("a/b/c/d/e/f/g.txt", five_block_foobar.clone(), 221) .unwrap(); builder - .put_file("a/b/c/d/e/h.txt", five_block_foobar.clone(), 221) + .put_link("a/b/c/d/e/h.txt", five_block_foobar.clone(), 221) .unwrap(); builder - .put_file("a/b/c/d/e/i.txt", five_block_foobar, 221) + .put_link("a/b/c/d/e/i.txt", five_block_foobar, 221) .unwrap(); let actual = builder @@ -237,7 +236,7 @@ mod tests { #[test] fn empty_path() { let mut builder = BufferingTreeBuilder::default(); - builder.put_file("", some_cid(0), 1).unwrap(); + builder.put_link("", some_cid(0), 1).unwrap(); let actual = builder .build() @@ -255,14 +254,14 @@ mod tests { #[should_panic] fn rooted_path() { let mut builder = BufferingTreeBuilder::default(); - builder.put_file("/a", some_cid(0), 1).unwrap(); + builder.put_link("/a", some_cid(0), 1).unwrap(); } #[test] #[should_panic] fn successive_slashes() { let mut builder = BufferingTreeBuilder::default(); - builder.put_file("a//b", some_cid(0), 1).unwrap(); + builder.put_link("a//b", some_cid(0), 1).unwrap(); } #[test] @@ -275,9 +274,9 @@ mod tests { opts.wrap_with_directory(); let mut builder = BufferingTreeBuilder::new(opts); builder - .put_file("a", five_block_foobar.clone(), 221) + .put_link("a", five_block_foobar.clone(), 221) .unwrap(); - builder.put_file("b", five_block_foobar, 221).unwrap(); + builder.put_link("b", five_block_foobar, 221).unwrap(); let actual = builder .build() @@ -303,7 +302,7 @@ mod tests { let mut opts = TreeOptions::default(); opts.wrap_with_directory(); let mut builder = BufferingTreeBuilder::new(opts); - builder.put_file("a", five_block_foobar, 221).unwrap(); + 
builder.put_link("a", five_block_foobar, 221).unwrap(); let actual = builder .build() @@ -324,24 +323,24 @@ mod tests { #[should_panic] fn denied_multiple_root_dirs() { let mut builder = BufferingTreeBuilder::default(); - builder.put_file("a/c.txt", some_cid(0), 1).unwrap(); - builder.put_file("b/d.txt", some_cid(1), 1).unwrap(); + builder.put_link("a/c.txt", some_cid(0), 1).unwrap(); + builder.put_link("b/d.txt", some_cid(1), 1).unwrap(); } #[test] #[should_panic] fn denied_multiple_root_files() { let mut builder = BufferingTreeBuilder::default(); - builder.put_file("a.txt", some_cid(0), 1).unwrap(); - builder.put_file("b.txt", some_cid(1), 1).unwrap(); + builder.put_link("a.txt", some_cid(0), 1).unwrap(); + builder.put_link("b.txt", some_cid(1), 1).unwrap(); } #[test] #[should_panic] fn using_leaf_as_node() { let mut builder = BufferingTreeBuilder::default(); - builder.put_file("a.txt", some_cid(0), 1).unwrap(); - builder.put_file("a.txt/b.txt", some_cid(1), 1).unwrap(); + builder.put_link("a.txt", some_cid(0), 1).unwrap(); + builder.put_link("a.txt/b.txt", some_cid(1), 1).unwrap(); } #[test] @@ -350,8 +349,8 @@ mod tests { builder .set_metadata("a/b/c/d", Metadata::default()) .unwrap(); - builder.put_file("a/b/c/d/e.txt", some_cid(1), 1).unwrap(); - builder.put_file("a/b/c/d/f.txt", some_cid(2), 1).unwrap(); + builder.put_link("a/b/c/d/e.txt", some_cid(1), 1).unwrap(); + builder.put_link("a/b/c/d/f.txt", some_cid(2), 1).unwrap(); let actual = builder .build() @@ -365,7 +364,7 @@ mod tests { #[test] fn set_metadata_on_file() { let mut builder = BufferingTreeBuilder::default(); - builder.put_file("a/a.txt", some_cid(0), 1).unwrap(); + builder.put_link("a/a.txt", some_cid(0), 1).unwrap(); let err = builder .set_metadata("a/a.txt", Metadata::default()) .unwrap_err(); @@ -384,7 +383,7 @@ mod tests { Cid::try_from("bafyreihakpd7te5nbmlhdk5ntvcvhf2hmfgrvcwna2sddq5zz5342mcbli").unwrap(); let mut builder = BufferingTreeBuilder::default(); - builder.put_file("a/b", 
target, 12).unwrap(); + builder.put_link("a/b", target, 12).unwrap(); let actual = builder .build() From 665c0c4f859719d64cd31efe665b349f24fc9ead Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 10 Aug 2020 09:23:11 +0300 Subject: [PATCH 54/57] doc: comment touch ups --- http/src/v0/root_files/add.rs | 7 +++++-- unixfs/benches/ingest-tar.rs | 4 +++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/http/src/v0/root_files/add.rs b/http/src/v0/root_files/add.rs index e5d392f99..a93ae0b37 100644 --- a/http/src/v0/root_files/add.rs +++ b/http/src/v0/root_files/add.rs @@ -204,7 +204,9 @@ where let (root, subtotal) = import_all(&ipfs, adder.finish()) .await .map_err(AddError::Persisting)? - .expect("I think there should always be something from finish -- except if the link block has just been compressed?"); + // there was a bug in ipfs-unixfs however in general the "push" operation + // should flush so that the final finish would still have work to do. + .expect("there should always be something from finish"); total_written += subtotal; @@ -217,7 +219,8 @@ where let filename: Cow<'_, str> = if filename.is_empty() { // cid needs to be repeated if no filename was given; in which case there - // should not be anything to build as tree either. + // should not be anything to build as tree either. however note that during + // the tree building Cow::Owned(root.to_string()) } else { Cow::Owned(filename) diff --git a/unixfs/benches/ingest-tar.rs b/unixfs/benches/ingest-tar.rs index a91e8d193..4dcf7da3a 100644 --- a/unixfs/benches/ingest-tar.rs +++ b/unixfs/benches/ingest-tar.rs @@ -45,8 +45,10 @@ fn ingest_tar(bytes: &[u8]) { let mut adder = FileAdder::default(); // with the std::io::Read it'd be good to read into the fileadder, or read into ... - // something. trying to acccess the buffer from in side FileAdder does not seem the be the + // something. trying to acccess the buffer from inside FileAdder does not seem the be the // way to go. 
+ // + // reusing the buffers between files would make a lot of sense as well if let Some(needed) = adder.size_hint().checked_sub(buffer.capacity()) { buffer.reserve(needed); From 14c27fd1b8e1fff20ddae61906bdb11599aa7dd3 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 10 Aug 2020 11:44:10 +0300 Subject: [PATCH 55/57] bench: move buffer and fix path growing outside --- unixfs/benches/ingest-tar.rs | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/unixfs/benches/ingest-tar.rs b/unixfs/benches/ingest-tar.rs index 4dcf7da3a..db735f427 100644 --- a/unixfs/benches/ingest-tar.rs +++ b/unixfs/benches/ingest-tar.rs @@ -5,7 +5,13 @@ pub fn criterion_benchmark(c: &mut Criterion) { match std::fs::read(file) { Ok(tar_bytes) => { - c.bench_function("ingest-tar", |b| b.iter(|| ingest_tar(&tar_bytes))); + // warmup should take care of right sizing these + let mut buffer = Vec::new(); + let mut path = String::new(); + + c.bench_function("ingest-tar", |b| { + b.iter(|| ingest_tar(&tar_bytes, &mut buffer, &mut path)) + }); } Err(e) if e.kind() == std::io::ErrorKind::NotFound => { eprintln!("could not find {:?}:", file); @@ -15,14 +21,12 @@ pub fn criterion_benchmark(c: &mut Criterion) { } } -fn ingest_tar(bytes: &[u8]) { +fn ingest_tar(bytes: &[u8], buffer: &mut Vec, path: &mut String) { use cid::Cid; use ipfs_unixfs::dir::builder::{BufferingTreeBuilder, TreeOptions}; use ipfs_unixfs::file::adder::FileAdder; use std::io::Read; - let mut buffer = Vec::new(); - let mut archive = tar::Archive::new(std::io::Cursor::new(bytes)); let entries = archive.entries().unwrap(); @@ -33,9 +37,10 @@ fn ingest_tar(bytes: &[u8]) { for entry in entries { let mut entry = entry.expect("assuming good tar"); - let path = std::str::from_utf8(&*entry.path_bytes()) - .unwrap() - .to_string(); // need to get rid of this + let path_bytes = entry.path_bytes(); + let tmp_path = std::str::from_utf8(&*path_bytes).unwrap(); + path.clear(); + path.push_str(tmp_path); if 
let Some(_link_name) = entry.link_name_bytes() { continue; From 276e4b7441df2e0cf69f9046034d40ab64f844bf Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 10 Aug 2020 11:45:58 +0300 Subject: [PATCH 56/57] bench: filter symlinks earlier --- unixfs/benches/ingest-tar.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/unixfs/benches/ingest-tar.rs b/unixfs/benches/ingest-tar.rs index db735f427..a24d8403d 100644 --- a/unixfs/benches/ingest-tar.rs +++ b/unixfs/benches/ingest-tar.rs @@ -37,15 +37,16 @@ fn ingest_tar(bytes: &[u8], buffer: &mut Vec, path: &mut String) { for entry in entries { let mut entry = entry.expect("assuming good tar"); + if let Some(_link_name) = entry.link_name_bytes() { + // TODO: symlinks + continue; + } + let path_bytes = entry.path_bytes(); let tmp_path = std::str::from_utf8(&*path_bytes).unwrap(); path.clear(); path.push_str(tmp_path); - if let Some(_link_name) = entry.link_name_bytes() { - continue; - } - if !path.ends_with('/') { let mut adder = FileAdder::default(); From aaabc37562bd5e27b8d8f1aafb7a4ea5995e9bb0 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Aug 2020 18:10:04 +0300 Subject: [PATCH 57/57] doc: suggestions and notes from code review Co-authored-by: ljedrz --- http/src/v0/root_files/add.rs | 5 +++-- unixfs/src/dir/builder/iter.rs | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/http/src/v0/root_files/add.rs b/http/src/v0/root_files/add.rs index a93ae0b37..71c731360 100644 --- a/http/src/v0/root_files/add.rs +++ b/http/src/v0/root_files/add.rs @@ -219,8 +219,9 @@ where let filename: Cow<'_, str> = if filename.is_empty() { // cid needs to be repeated if no filename was given; in which case there - // should not be anything to build as tree either. however note that during - // the tree building + // should not be anything to build as tree either. 
note that intentionally + // no such Cid repeating happens when building the tree and a new wrapping + // root will have empty filename in the progress report. Cow::Owned(root.to_string()) } else { Cow::Owned(filename) diff --git a/unixfs/src/dir/builder/iter.rs b/unixfs/src/dir/builder/iter.rs index 7d3d314f7..208e4f519 100644 --- a/unixfs/src/dir/builder/iter.rs +++ b/unixfs/src/dir/builder/iter.rs @@ -15,7 +15,7 @@ pub struct PostOrderIterator { block_buffer: Vec, // our stack of pending work pending: Vec, - // "communication channel" from nested entries back to their parents this hashmap is only used + // "communication channel" from nested entries back to their parents; this hashmap is only used // in the event of mixed child nodes (leaves and nodes). persisted_cids: HashMap>>, reused_children: Vec,