From 0b3cbc4ddb79270a9e41918847d78354f230400f Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 9 Jul 2023 11:00:35 +0200 Subject: [PATCH 01/18] Add support for attribute queries --- Cargo.lock | 3 +++ gix-archive/Cargo.toml | 3 +++ gix-archive/src/write.rs | 20 ++++++++++++++++++-- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1f7caba7bb4..e99db9334ed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1398,6 +1398,9 @@ dependencies = [ name = "gix-archive" version = "0.2.0" dependencies = [ + "gix-attributes 0.15.0", + "gix-features 0.32.0", + "gix-filter", "gix-hash 0.11.3", "gix-object 0.33.0", "thiserror", diff --git a/gix-archive/Cargo.toml b/gix-archive/Cargo.toml index 6e75a50d093..dc3cc5b78ed 100644 --- a/gix-archive/Cargo.toml +++ b/gix-archive/Cargo.toml @@ -13,5 +13,8 @@ doctest = false [dependencies] thiserror = "1.0.26" +gix-features = { version = "^0.32.0", path = "../gix-features", features = ["progress"] } gix-hash = { version = "^0.11.3", path = "../gix-hash" } gix-object = { version = "^0.33.0", path = "../gix-object" } +gix-attributes = { version = "^0.15.0", path = "../gix-attributes" } +gix-filter = { version = "^0.1.0", path = "../gix-filter" } diff --git a/gix-archive/src/write.rs b/gix-archive/src/write.rs index 0b3e29ea32e..2b69cd236e9 100644 --- a/gix-archive/src/write.rs +++ b/gix-archive/src/write.rs @@ -1,11 +1,27 @@ use crate::{Error, Options}; +use gix_object::bstr::BStr; /// Use `find` to traverse `tree` and fetch the contained blobs to write to `out` configured according to `opts`. +/// `pipeline` is used to convert blobs to their worktree representation, and `attributes` is used to read +/// the `export-ignore` attribute. If set on a directory or blob, it won't be added to the archive. +/// +/// ### Progress and interruptions +/// +/// For per-file progress, integrate progress handling into `find` as it is called for trees and blobs. +/// `find` should also be used for interrupt handling, as it can return an error once per file. +/// For progress on bytes-written, integrate progress reporting into `out`. /// /// ### Limitations /// -/// * `.gitattributes` aren't considered, and filters are not applied, affecting correctness. -pub fn write_to(_tree: &gix_hash::oid, mut _find: Find, mut _out: W, _opts: Options) -> Result<(), Error> +/// * `export-subst` is not support, as it requires the entire formatting engine of `git log`. +pub fn write_to( + _tree: &gix_hash::oid, + mut _find: Find, + _pipeline: &mut gix_filter::Pipeline, + _attributes: impl FnOnce(&BStr, &mut gix_attributes::search::Outcome), + mut _out: W, + _opts: Options, +) -> Result<(), Error> where W: std::io::Write, Find: for<'a> FnMut(&gix_hash::oid, &'a mut Vec) -> Result, E>, From 412067c3b7b5b7568d3ab37190987195381d7299 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 19 Jul 2023 12:40:56 +0200 Subject: [PATCH 02/18] remove the `Format` type in favor of an incredibly simple streaming format The idea is to only get the vital information in a format that can easily be decoded by the API user. This way, all container formats are implemented in the calling crate or application. That way we avoid having to deal with feature toggles for the various utility crates that would implement the stream formats. 
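For illustration, this is roughly how a consumer could assemble a `tar` archive from the stream on its own. This is a minimal sketch that assumes the `tar` crate on the consumer side and the `Stream`/`Entry` API introduced in this patch, and it fixes the entry mode to a plain file for brevity:

    fn stream_to_tar(mut stream: gix_archive::Stream, out: impl std::io::Write) -> std::io::Result<()> {
        use std::io::Read;
        let mut ar = tar::Builder::new(out);
        while let Some(mut entry) = stream
            .next_entry()
            .map_err(|err| std::io::Error::new(std::io::ErrorKind::Other, err))?
        {
            // The stream only carries the relative path and the raw bytes; headers,
            // permissions and compression are entirely up to the consumer.
            let path = entry.relative_path().to_string();
            let mut data = Vec::new();
            entry.read_to_end(&mut data)?;

            let mut header = tar::Header::new_gnu();
            header.set_size(data.len() as u64);
            header.set_mode(0o644); // simplification: a real consumer would track file modes itself
            header.set_cksum();
            ar.append_data(&mut header, path, data.as_slice())?;
        }
        ar.into_inner()?.flush()
    }
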
--- Cargo.lock | 2 + gix-archive/Cargo.toml | 7 ++- gix-archive/src/lib.rs | 70 ++++------------------ gix-archive/src/stream.rs | 123 ++++++++++++++++++++++++++++++++++++++ gix-archive/src/write.rs | 30 ++++++---- 5 files changed, 161 insertions(+), 71 deletions(-) create mode 100644 gix-archive/src/stream.rs diff --git a/Cargo.lock b/Cargo.lock index e99db9334ed..24aaaed4992 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1403,6 +1403,8 @@ dependencies = [ "gix-filter", "gix-hash 0.11.3", "gix-object 0.33.0", + "gix-traverse 0.30.0", + "parking_lot", "thiserror", ] diff --git a/gix-archive/Cargo.toml b/gix-archive/Cargo.toml index dc3cc5b78ed..9bfa5fec0a6 100644 --- a/gix-archive/Cargo.toml +++ b/gix-archive/Cargo.toml @@ -12,9 +12,12 @@ rust-version = "1.65" doctest = false [dependencies] -thiserror = "1.0.26" -gix-features = { version = "^0.32.0", path = "../gix-features", features = ["progress"] } +gix-features = { version = "^0.32.0", path = "../gix-features", features = ["progress", "io-pipe"] } gix-hash = { version = "^0.11.3", path = "../gix-hash" } gix-object = { version = "^0.33.0", path = "../gix-object" } gix-attributes = { version = "^0.15.0", path = "../gix-attributes" } gix-filter = { version = "^0.1.0", path = "../gix-filter" } +gix-traverse = { version = "^0.30.0", path = "../gix-traverse" } + +thiserror = "1.0.26" +parking_lot = "0.12.1" diff --git a/gix-archive/src/lib.rs b/gix-archive/src/lib.rs index 861eee7c79f..d3a63c825fc 100644 --- a/gix-archive/src/lib.rs +++ b/gix-archive/src/lib.rs @@ -1,63 +1,19 @@ //! The implementation of creating an archive from a git tree, similar to `git archive`. -#![deny(rust_2018_idioms, missing_docs)] -#![forbid(unsafe_code)] +#![deny(rust_2018_idioms, missing_docs, unsafe_code)] -/// The error returned by [`write_to()`]. -#[derive(Debug, thiserror::Error)] -#[allow(missing_docs)] -pub enum Error -where - E: std::error::Error + Send + Sync + 'static, -{ - #[error(transparent)] - Io(#[from] std::io::Error), - #[error("Could not find a blob or tree for archival")] - Find(#[source] E), -} - -/// The supported container formats for use in [`write_to()`]. -#[derive(Default, PartialEq, Eq, Copy, Clone, Debug)] -pub enum Format { - /// A standard `tar` archive. - /// - /// Use it as well if a custom container format is desired. The idea is to decode it on a separate thread - /// to rewrite the data to the desired format. - #[default] - Tar, - /// A convenience format that will `zip` deflate the `tar` stream. - TarGz { - /// The compression level to use for the `zlib` compression, ranging from 0 (no compression) to 9 (best compression). - compression_level: u8, - }, - /// Use the zip` container format, instead of `tar`, provided for convenience. - Zip { - /// The compression level to use for the `zlib` compression, ranging from 0 (no compression) to 9 (best compression). - compression_level: u8, - }, -} - -/// Options for configuring [`write_to()`]. -#[derive(Clone, Debug)] -pub struct Options { - /// The archive's format. - pub format: Format, - /// Given a `path`, originating in the git tree, to place into the archive, put `/path` in front of it. - pub tree_prefix: Option, - /// The modification time for all entries in the archive. - /// - /// Defaults to the current time. The caller may set this to the commit time if available. 
- pub modification_time: std::time::SystemTime, -} +use gix_object::bstr::BString; +use std::sync::Arc; -impl Default for Options { - fn default() -> Self { - Options { - format: Default::default(), - tree_prefix: None, - modification_time: std::time::SystemTime::now(), - } - } -} +/// +pub mod stream; mod write; pub use write::write_to; + +/// A stream of entries that is produced from an underlying reader. +pub struct Stream { + read: gix_features::io::pipe::Reader, + err: Arc>>, + /// `None` if currently held by an entry. + path_buf: Option, +} diff --git a/gix-archive/src/stream.rs b/gix-archive/src/stream.rs new file mode 100644 index 00000000000..e3558779e96 --- /dev/null +++ b/gix-archive/src/stream.rs @@ -0,0 +1,123 @@ +use crate::Stream; +use gix_object::bstr::{BStr, BString}; +use std::io::{ErrorKind, Read}; + +/// The error returned by [`next_entry()`][Stream::next_entry()]. +#[derive(Debug, thiserror::Error)] +#[allow(missing_docs)] +pub enum Error { + #[error(transparent)] + Io(#[from] std::io::Error), + #[error("Could not find a blob or tree for archival")] + Find(#[source] Box), + #[error("Could not query attributes for path \"{path}\"")] + Attributes { + path: BString, + source: Box, + }, +} + +/// An entry in a stream. Note that they must be consumed fully, by reading from them till exhaustion. +/// +/// ### Drop behaviour +/// +/// If the entry is dropped without reading it till exhaustion, the stream is tainted and +/// [`next_entry()`][Stream::next_entry()] will panic next time it is called. +pub struct Entry<'a> { + /// Access to our parent + parent: &'a mut Stream, + + /// The path relative to the repository at which data should be written. + path_buf: Option, + /// The amount of bytes left to read + remaining: usize, +} + +impl Entry<'_> { + /// Return the path of this entry as slash-separated path relative to the repository. + pub fn relative_path(&self) -> &BStr { + self.path_buf.as_ref().expect("always set during our lifetime").as_ref() + } +} + +impl<'a> Drop for Entry<'a> { + fn drop(&mut self) { + if self.remaining == 0 { + self.parent.path_buf = self.path_buf.take(); + } + } +} + +impl std::io::Read for Entry<'_> { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + let buf_len = buf.len(); + if let Some(err) = self.parent.err.lock().take() { + return Err(std::io::Error::new(ErrorKind::Other, err)); + } + let bytes_read = self.parent.read.read(&mut buf[..buf_len.min(self.remaining)])?; + self.remaining -= bytes_read; + Ok(bytes_read) + } +} + +impl Stream { + /// Access the next entry of the stream or `None` if there is nothing more to read. 
+ pub fn next_entry(&mut self) -> Result>, Error> { + assert!( + self.path_buf.is_some(), + "BUG: must consume and drop entry before getting the next one" + ); + let res = self.read_entry_info(); + match res { + Ok(remaining) => { + if let Some(err) = self.err.lock().take() { + return Err(err); + } + Ok(Some(Entry { + path_buf: self.path_buf.take(), + parent: self, + remaining, + })) + } + Err(err) => { + if err.kind() == ErrorKind::UnexpectedEof { + if let Some(err) = self.err.lock().take() { + return Err(err); + } + } + Err(err.into()) + } + } + } + + // Format: [usize-LE][usize-LE][relative_path_bytes][object_stream] + fn read_entry_info(&mut self) -> Result { + let mut buf = [0; std::mem::size_of::()]; + + self.read.read_exact(&mut buf)?; + let path_len = usize::from_le_bytes(buf); + + self.read.read_exact(&mut buf)?; + let stream_size = usize::from_le_bytes(buf); + + let path_buf = self.path_buf.as_mut().expect("set while producing an entry"); + clear_and_set_capacity(path_buf, path_len); + + // SAFETY: `clear_and_set_capacity` assures the vec has the right capacity to hold `path_len` + #[allow(unsafe_code)] + unsafe { + path_buf.set_len(path_len); + } + self.read.read_exact(path_buf)?; + + Ok(stream_size) + } +} + +fn clear_and_set_capacity(buf: &mut Vec, cap: usize) { + buf.clear(); + if buf.capacity() < cap { + buf.reserve(cap); + assert!(buf.capacity() >= cap, "{} >= {}", buf.capacity(), cap); + } +} diff --git a/gix-archive/src/write.rs b/gix-archive/src/write.rs index 2b69cd236e9..40c664d91ce 100644 --- a/gix-archive/src/write.rs +++ b/gix-archive/src/write.rs @@ -1,7 +1,8 @@ -use crate::{Error, Options}; +use crate::Stream; use gix_object::bstr::BStr; -/// Use `find` to traverse `tree` and fetch the contained blobs to write to `out` configured according to `opts`. +/// Use `find` to traverse `tree` and fetch the contained blobs to return as [`Stream`], which makes them queryable +/// on demand with support for streaming each entry. /// `pipeline` is used to convert blobs to their worktree representation, and `attributes` is used to read /// the `export-ignore` attribute. If set on a directory or blob, it won't be added to the archive. /// @@ -9,23 +10,28 @@ use gix_object::bstr::BStr; /// /// For per-file progress, integrate progress handling into `find` as it is called for trees and blobs. /// `find` should also be used for interrupt handling, as it can return an error once per file. -/// For progress on bytes-written, integrate progress reporting into `out`. +/// For progress on bytes-written, integrate progress reporting when consuming the stream. +/// Further it's possible to drop the returned [`Stream`] to halt all operation. +/// +/// ### Threaded Operation +/// +/// This function spawns a thread that will access the tree data in the background, synchronized through +/// `Stream` so that it will not be faster than the consumer, with at most one file in flight at any time. /// /// ### Limitations /// /// * `export-subst` is not support, as it requires the entire formatting engine of `git log`. 
-pub fn write_to( - _tree: &gix_hash::oid, +pub fn write_to( + _tree: gix_hash::ObjectId, mut _find: Find, _pipeline: &mut gix_filter::Pipeline, - _attributes: impl FnOnce(&BStr, &mut gix_attributes::search::Outcome), - mut _out: W, - _opts: Options, -) -> Result<(), Error> + _attributes: impl FnMut(&BStr, &mut gix_attributes::search::Outcome) -> Result<(), E2> + Send, +) -> Stream where W: std::io::Write, - Find: for<'a> FnMut(&gix_hash::oid, &'a mut Vec) -> Result, E>, - E: std::error::Error + Send + Sync + 'static, + Find: for<'a> FnMut(&gix_hash::oid, &'a mut Vec) -> Result, E1> + Send, + E1: std::error::Error + Send + Sync + 'static, + E2: std::error::Error + Send + Sync + 'static, { - Ok(()) + todo!() } From 0634d543dacc9b3ce3d39938b116a890bcb0686f Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 19 Jul 2023 19:18:46 +0200 Subject: [PATCH 03/18] fix!: make information about whether or not the matched item is a directory available. This is important for matching, even though it works differently than excludes. --- gix-attributes/src/search/attributes.rs | 6 ++++-- gix-attributes/tests/search/mod.rs | 8 ++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/gix-attributes/src/search/attributes.rs b/gix-attributes/src/search/attributes.rs index 078c187bbfc..8611dcbd614 100644 --- a/gix-attributes/src/search/attributes.rs +++ b/gix-attributes/src/search/attributes.rs @@ -103,13 +103,14 @@ impl Search { &'a self, relative_path: impl Into<&'b BStr>, case: gix_glob::pattern::Case, + is_dir: Option, out: &mut Outcome, ) -> bool { let relative_path = relative_path.into(); let basename_pos = relative_path.rfind(b"/").map(|p| p + 1); let mut has_match = false; self.patterns.iter().rev().any(|pl| { - has_match |= pattern_matching_relative_path(pl, relative_path, basename_pos, case, out); + has_match |= pattern_matching_relative_path(pl, relative_path, basename_pos, case, is_dir, out); out.is_done() }); has_match @@ -201,6 +202,7 @@ fn pattern_matching_relative_path( relative_path: &BStr, basename_pos: Option, case: gix_glob::pattern::Case, + is_dir: Option, out: &mut Outcome, ) -> bool { let (relative_path, basename_start_pos) = @@ -227,7 +229,7 @@ fn pattern_matching_relative_path( Value::Assignments(attrs) => attrs, }; if out.has_unspecified_attributes(attrs.iter().map(|attr| attr.id)) - && pattern.matches_repo_relative_path(relative_path, basename_start_pos, None, case) + && pattern.matches_repo_relative_path(relative_path, basename_start_pos, is_dir, case) { let all_filled = out.fill_attributes(attrs.iter(), pattern, list.source.as_ref(), *sequence_number); if all_filled { diff --git a/gix-attributes/tests/search/mod.rs b/gix-attributes/tests/search/mod.rs index a85ec2fbcfd..ef3650d0581 100644 --- a/gix-attributes/tests/search/mod.rs +++ b/gix-attributes/tests/search/mod.rs @@ -50,7 +50,7 @@ mod specials { ); let mut out = Outcome::default(); out.initialize(&collection); - search.pattern_matching_relative_path(path, case, &mut out) + search.pattern_matching_relative_path(path, case, None, &mut out) } fn searchi(pattern: &str, path: &str, rela_containing_dir: Option<&str>) -> bool { @@ -100,7 +100,7 @@ fn baseline() -> crate::Result { actual.initialize(&collection); for (rela_path, expected) in (baseline::Expectations { lines: input.lines() }) { actual.reset(); - let has_match = group.pattern_matching_relative_path(rela_path, case, &mut actual); + let has_match = group.pattern_matching_relative_path(rela_path, case, None, &mut actual); assert_references(&actual); let 
actual: Vec<_> = actual .iter() @@ -202,7 +202,7 @@ fn all_attributes_are_listed_in_declaration_order() -> crate::Result { for (rela_path, expected) in (baseline::Expectations { lines: input.lines() }) { out.reset(); - group.pattern_matching_relative_path(rela_path, Case::Sensitive, &mut out); + group.pattern_matching_relative_path(rela_path, Case::Sensitive, None, &mut out); assert_references(&out); let actual: Vec<_> = out.iter().map(|m| m.assignment).collect(); assert_eq!( @@ -239,7 +239,7 @@ fn given_attributes_are_made_available_in_given_order() -> crate::Result { for (rela_path, expected) in (baseline::Expectations { lines: input.lines() }) { out.reset(); - group.pattern_matching_relative_path(rela_path, Case::Sensitive, &mut out); + group.pattern_matching_relative_path(rela_path, Case::Sensitive, None, &mut out); assert_references(&out); let actual: Vec<_> = out.iter_selected().map(|m| m.assignment).collect(); assert_eq!( From 68bd71cc0c47b0d86c7cb8fb9fe73a03cf8b52f6 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 19 Jul 2023 19:19:55 +0200 Subject: [PATCH 04/18] fix!: make it possible to pass information about the directory status when matching attributes. This is significant for archiving operations, even though it's not important when matching attributes otherwise. --- gix-worktree/src/cache/platform.rs | 2 +- gix-worktree/src/cache/state/attributes.rs | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/gix-worktree/src/cache/platform.rs b/gix-worktree/src/cache/platform.rs index 27d0bfbc822..d07ef6e8858 100644 --- a/gix-worktree/src/cache/platform.rs +++ b/gix-worktree/src/cache/platform.rs @@ -44,7 +44,7 @@ impl<'a> Platform<'a> { let attrs = self.parent.state.attributes_or_panic(); let relative_path = gix_path::to_unix_separators_on_windows(gix_path::into_bstr(self.parent.stack.current_relative())); - attrs.matching_attributes(relative_path.as_bstr(), self.parent.case, out) + attrs.matching_attributes(relative_path.as_bstr(), self.parent.case, self.is_dir, out) } } diff --git a/gix-worktree/src/cache/state/attributes.rs b/gix-worktree/src/cache/state/attributes.rs index c9c6a14d862..00b61544879 100644 --- a/gix-worktree/src/cache/state/attributes.rs +++ b/gix-worktree/src/cache/state/attributes.rs @@ -183,6 +183,7 @@ impl Attributes { &self, relative_path: &BStr, case: Case, + is_dir: Option, out: &mut gix_attributes::search::Outcome, ) -> bool { // assure `out` is ready to deal with possibly changed collections (append-only) @@ -191,7 +192,7 @@ impl Attributes { let groups = [&self.globals, &self.stack]; let mut has_match = false; groups.iter().rev().any(|group| { - has_match |= group.pattern_matching_relative_path(relative_path, case, out); + has_match |= group.pattern_matching_relative_path(relative_path, case, is_dir, out); out.is_done() }); has_match From a9eab8d59442be19e05be4912c756188675b2bda Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 20 Jul 2023 10:44:43 +0200 Subject: [PATCH 05/18] feat: add `is_executable()` function to determine if metadata of a file is executable. --- gix-fs/src/lib.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/gix-fs/src/lib.rs b/gix-fs/src/lib.rs index aa576c24062..f4286d727c9 100644 --- a/gix-fs/src/lib.rs +++ b/gix-fs/src/lib.rs @@ -51,5 +51,18 @@ pub struct Stack { current_is_directory: bool, } +#[cfg(unix)] +/// Returns whether a a file has the executable permission set. 
+pub fn is_executable(metadata: &std::fs::Metadata) -> bool { + use std::os::unix::fs::MetadataExt; + (metadata.mode() & 0o100) != 0 +} + +#[cfg(not(unix))] +/// Returns whether a a file has the executable permission set. +pub fn is_executable(_metadata: &std::fs::Metadata) -> bool { + false +} + /// pub mod stack; From 1c1d19b715b4c3e716ebcde643cad9a75912e5fc Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 20 Jul 2023 10:45:51 +0200 Subject: [PATCH 06/18] Use new `gix-fs` capabilities --- Cargo.lock | 1 + gix-index/Cargo.toml | 1 + gix-index/src/entry/mode.rs | 21 +++++---------------- 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 24aaaed4992..baf5f0b4940 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1851,6 +1851,7 @@ dependencies = [ "filetime", "gix-bitmap 0.2.5", "gix-features 0.32.0", + "gix-fs 0.4.0", "gix-hash 0.11.3", "gix-lock 7.0.1", "gix-object 0.33.0", diff --git a/gix-index/Cargo.toml b/gix-index/Cargo.toml index be11c7c4ee7..bf7bd769793 100644 --- a/gix-index/Cargo.toml +++ b/gix-index/Cargo.toml @@ -26,6 +26,7 @@ gix-bitmap = { version = "^0.2.5", path = "../gix-bitmap" } gix-object = { version = "^0.33.0", path = "../gix-object" } gix-traverse = { version = "^0.30.0", path = "../gix-traverse" } gix-lock = { version = "^7.0.0", path = "../gix-lock" } +gix-fs = { version = "^0.4.0", path = "../gix-fs" } thiserror = "1.0.32" memmap2 = "0.7.0" diff --git a/gix-index/src/entry/mode.rs b/gix-index/src/entry/mode.rs index 7d3fdf506a9..9adaa1bd5c5 100644 --- a/gix-index/src/entry/mode.rs +++ b/gix-index/src/entry/mode.rs @@ -1,18 +1,5 @@ use crate::entry::Mode; -#[cfg(unix)] -/// Returns whether a a file has the executable permission set. -fn is_executable(metadata: &std::fs::Metadata) -> bool { - use std::os::unix::fs::MetadataExt; - (metadata.mode() & 0o100) != 0 -} - -#[cfg(not(unix))] -/// Returns whether a a file has the executable permission set. -fn is_executable(_metadata: &std::fs::Metadata) -> bool { - false -} - impl Mode { /// Return true if this is a sparse entry, as it points to a directory which usually isn't what an 'unsparse' index tracks. pub fn is_sparse(&self) -> bool { @@ -54,13 +41,15 @@ impl Mode { Mode::SYMLINK if has_symlinks && !stat.is_symlink() => (), Mode::SYMLINK if !has_symlinks && !stat.is_file() => (), Mode::COMMIT | Mode::DIR if !stat.is_dir() => (), - Mode::FILE if executable_bit && is_executable(stat) => return Some(Change::ExecutableBit), - Mode::FILE_EXECUTABLE if executable_bit && !is_executable(stat) => return Some(Change::ExecutableBit), + Mode::FILE if executable_bit && gix_fs::is_executable(stat) => return Some(Change::ExecutableBit), + Mode::FILE_EXECUTABLE if executable_bit && !gix_fs::is_executable(stat) => { + return Some(Change::ExecutableBit) + } _ => return None, }; let new_mode = if stat.is_dir() { Mode::COMMIT - } else if executable_bit && is_executable(stat) { + } else if executable_bit && gix_fs::is_executable(stat) { Mode::FILE_EXECUTABLE } else { Mode::FILE From ca9294a597d3b4a19aa6338e9ba0893269b9d1a2 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 19 Jul 2023 14:23:54 +0200 Subject: [PATCH 07/18] feat: implement `write_to()` in full. 
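
The tree is traversed on a background thread which applies worktree filters and the
`export-ignore` attribute, and writes each entry into an internal pipe protocol that
`Stream::next_entry()` decodes on the consumer side. Additional entries from disk or
memory can be added to the stream before its traversal output is consumed.

A rough usage sketch, where `head_tree`, `odb`, `cache` and `pipeline` are stand-ins
set up the same way as in the new tests of this patch:

    let mut stream = gix_archive::write_to(
        head_tree,                        // the tree to archive, as ObjectId
        move |id, buf| odb.find(id, buf), // object lookup
        pipeline,                         // gix_filter::Pipeline for worktree conversion
        move |rela_path, mode, attrs| {   // attribute lookup, used for `export-ignore`
            cache
                .at_entry(rela_path, mode.is_tree().into(), |id, buf| odb.find_blob(id, buf))
                .map(|entry| entry.matching_attributes(attrs))
                .map(|_| ())
        },
    );
    while let Some(mut entry) = stream.next_entry()? {
        // `entry` is a `std::io::Read` over the filtered blob contents,
        // with `relative_path()`, `mode` and `id` describing it.
    }
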
--- Cargo.lock | 5 + gix-archive/Cargo.toml | 7 + gix-archive/src/lib.rs | 21 +- gix-archive/src/stream.rs | 123 -------- gix-archive/src/stream/entry.rs | 98 ++++++ gix-archive/src/stream/mod.rs | 192 ++++++++++++ gix-archive/src/stream/protocol.rs | 175 +++++++++++ gix-archive/src/write.rs | 37 --- gix-archive/src/write/mod.rs | 151 +++++++++ gix-archive/src/write/traverse.rs | 153 +++++++++ gix-archive/tests/archive.rs | 294 ++++++++++++++++++ gix-archive/tests/fixtures/basic.sh | 31 ++ .../fixtures/generated-archives/basic.tar.xz | 3 + 13 files changed, 1125 insertions(+), 165 deletions(-) delete mode 100644 gix-archive/src/stream.rs create mode 100644 gix-archive/src/stream/entry.rs create mode 100644 gix-archive/src/stream/mod.rs create mode 100644 gix-archive/src/stream/protocol.rs delete mode 100644 gix-archive/src/write.rs create mode 100644 gix-archive/src/write/mod.rs create mode 100644 gix-archive/src/write/traverse.rs create mode 100644 gix-archive/tests/fixtures/basic.sh create mode 100644 gix-archive/tests/fixtures/generated-archives/basic.tar.xz diff --git a/Cargo.lock b/Cargo.lock index baf5f0b4940..91f622e6799 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1401,9 +1401,14 @@ dependencies = [ "gix-attributes 0.15.0", "gix-features 0.32.0", "gix-filter", + "gix-fs 0.4.0", "gix-hash 0.11.3", "gix-object 0.33.0", + "gix-odb", + "gix-path 0.8.3", + "gix-testtools", "gix-traverse 0.30.0", + "gix-worktree 0.22.0", "parking_lot", "thiserror", ] diff --git a/gix-archive/Cargo.toml b/gix-archive/Cargo.toml index 9bfa5fec0a6..8c25e4b036c 100644 --- a/gix-archive/Cargo.toml +++ b/gix-archive/Cargo.toml @@ -18,6 +18,13 @@ gix-object = { version = "^0.33.0", path = "../gix-object" } gix-attributes = { version = "^0.15.0", path = "../gix-attributes" } gix-filter = { version = "^0.1.0", path = "../gix-filter" } gix-traverse = { version = "^0.30.0", path = "../gix-traverse" } +gix-fs = { version = "^0.4.0", path = "../gix-fs" } +gix-path = { version = "^0.8.3", path = "../gix-path" } thiserror = "1.0.26" parking_lot = "0.12.1" + +[dev-dependencies] +gix-testtools = { path = "../tests/tools"} +gix-odb = { path = "../gix-odb"} +gix-worktree = { path = "../gix-worktree"} diff --git a/gix-archive/src/lib.rs b/gix-archive/src/lib.rs index d3a63c825fc..7fae7820eb6 100644 --- a/gix-archive/src/lib.rs +++ b/gix-archive/src/lib.rs @@ -1,8 +1,9 @@ -//! The implementation of creating an archive from a git tree, similar to `git archive`. +//! The implementation of creating an archive from a git tree, similar to `git archive`, but using an internal format. +//! +//! This crate can effectively be used to manipulate worktrees as streams of bytes, which can be decoded using the [`Stream`] type. #![deny(rust_2018_idioms, missing_docs, unsafe_code)] use gix_object::bstr::BString; -use std::sync::Arc; /// pub mod stream; @@ -10,10 +11,20 @@ pub mod stream; mod write; pub use write::write_to; -/// A stream of entries that is produced from an underlying reader. +/// A stream of entries that originate from a git tree and optionally from additional entries. +/// +/// Note that a git tree is mandatory, but the empty tree can be used to effectively disable it. pub struct Stream { - read: gix_features::io::pipe::Reader, - err: Arc>>, + read: stream::utils::Read, + err: stream::SharedErrorSlot, + extra_entries: Option>, + // additional_entries: Vec, /// `None` if currently held by an entry. path_buf: Option, + /// Another buffer to partially act like a buf-reader. 
+ buf: Vec, + /// The offset into `buf` for entries being able to act like a buf reader. + pos: usize, + /// The amount of bytes usable from `buf` (even though it always has a fixed size) + filled: usize, } diff --git a/gix-archive/src/stream.rs b/gix-archive/src/stream.rs deleted file mode 100644 index e3558779e96..00000000000 --- a/gix-archive/src/stream.rs +++ /dev/null @@ -1,123 +0,0 @@ -use crate::Stream; -use gix_object::bstr::{BStr, BString}; -use std::io::{ErrorKind, Read}; - -/// The error returned by [`next_entry()`][Stream::next_entry()]. -#[derive(Debug, thiserror::Error)] -#[allow(missing_docs)] -pub enum Error { - #[error(transparent)] - Io(#[from] std::io::Error), - #[error("Could not find a blob or tree for archival")] - Find(#[source] Box), - #[error("Could not query attributes for path \"{path}\"")] - Attributes { - path: BString, - source: Box, - }, -} - -/// An entry in a stream. Note that they must be consumed fully, by reading from them till exhaustion. -/// -/// ### Drop behaviour -/// -/// If the entry is dropped without reading it till exhaustion, the stream is tainted and -/// [`next_entry()`][Stream::next_entry()] will panic next time it is called. -pub struct Entry<'a> { - /// Access to our parent - parent: &'a mut Stream, - - /// The path relative to the repository at which data should be written. - path_buf: Option, - /// The amount of bytes left to read - remaining: usize, -} - -impl Entry<'_> { - /// Return the path of this entry as slash-separated path relative to the repository. - pub fn relative_path(&self) -> &BStr { - self.path_buf.as_ref().expect("always set during our lifetime").as_ref() - } -} - -impl<'a> Drop for Entry<'a> { - fn drop(&mut self) { - if self.remaining == 0 { - self.parent.path_buf = self.path_buf.take(); - } - } -} - -impl std::io::Read for Entry<'_> { - fn read(&mut self, buf: &mut [u8]) -> std::io::Result { - let buf_len = buf.len(); - if let Some(err) = self.parent.err.lock().take() { - return Err(std::io::Error::new(ErrorKind::Other, err)); - } - let bytes_read = self.parent.read.read(&mut buf[..buf_len.min(self.remaining)])?; - self.remaining -= bytes_read; - Ok(bytes_read) - } -} - -impl Stream { - /// Access the next entry of the stream or `None` if there is nothing more to read. 
- pub fn next_entry(&mut self) -> Result>, Error> { - assert!( - self.path_buf.is_some(), - "BUG: must consume and drop entry before getting the next one" - ); - let res = self.read_entry_info(); - match res { - Ok(remaining) => { - if let Some(err) = self.err.lock().take() { - return Err(err); - } - Ok(Some(Entry { - path_buf: self.path_buf.take(), - parent: self, - remaining, - })) - } - Err(err) => { - if err.kind() == ErrorKind::UnexpectedEof { - if let Some(err) = self.err.lock().take() { - return Err(err); - } - } - Err(err.into()) - } - } - } - - // Format: [usize-LE][usize-LE][relative_path_bytes][object_stream] - fn read_entry_info(&mut self) -> Result { - let mut buf = [0; std::mem::size_of::()]; - - self.read.read_exact(&mut buf)?; - let path_len = usize::from_le_bytes(buf); - - self.read.read_exact(&mut buf)?; - let stream_size = usize::from_le_bytes(buf); - - let path_buf = self.path_buf.as_mut().expect("set while producing an entry"); - clear_and_set_capacity(path_buf, path_len); - - // SAFETY: `clear_and_set_capacity` assures the vec has the right capacity to hold `path_len` - #[allow(unsafe_code)] - unsafe { - path_buf.set_len(path_len); - } - self.read.read_exact(path_buf)?; - - Ok(stream_size) - } -} - -fn clear_and_set_capacity(buf: &mut Vec, cap: usize) { - buf.clear(); - if buf.capacity() < cap { - buf.reserve(cap); - assert!(buf.capacity() >= cap, "{} >= {}", buf.capacity(), cap); - } -} diff --git a/gix-archive/src/stream/entry.rs b/gix-archive/src/stream/entry.rs new file mode 100644 index 00000000000..801cd32c688 --- /dev/null +++ b/gix-archive/src/stream/entry.rs @@ -0,0 +1,98 @@ +use crate::stream::Entry; +use gix_object::bstr::BStr; +use std::io::{ErrorKind, Read}; +use std::path::PathBuf; + +/// The source of an additional entry +pub enum Source { + /// There is no content, typically the case with directories which are always considered empty. + Null, + /// Read from the file at the given path. + Path(PathBuf), + /// Read from memory. + Memory(Vec), +} + +impl Source { + pub(crate) fn len(&self) -> Option { + match self { + Source::Null => Some(0), + Source::Path(_) => None, + Source::Memory(buf) => Some(buf.len()), + } + } +} + +impl std::fmt::Debug for Entry<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Entry") + .field("path_buf", &self.relative_path()) + .field("mode", &self.mode) + .field("id", &self.id) + .field("remaining", &self.remaining) + .finish() + } +} + +impl Entry<'_> { + /// Return the path of this entry as slash-separated path relative to the repository. 
+ pub fn relative_path(&self) -> &BStr { + self.path_buf.as_ref().expect("always set during our lifetime").as_ref() + } +} + +impl<'a> Drop for Entry<'a> { + fn drop(&mut self) { + if self.remaining == Some(0) { + self.parent.path_buf = self.path_buf.take(); + } + } +} + +impl Entry<'_> { + fn fill_buf(&mut self) -> std::io::Result<&[u8]> { + if self.parent.pos >= self.parent.filled { + let mut u16_buf = [0; 2]; + self.parent.read.read_exact(&mut u16_buf)?; + let nb = u16::from_le_bytes(u16_buf) as usize; + + if nb != 0 { + self.parent + .read + .read_exact(&mut self.parent.buf[self.parent.filled..][..nb])?; + } + self.parent.filled = nb; + self.parent.pos = 0; + } + Ok(&self.parent.buf[self.parent.pos..self.parent.filled]) + } +} + +impl std::io::Read for Entry<'_> { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + let buf_len = buf.len(); + if let Some(err) = self.parent.err.lock().take() { + return Err(std::io::Error::new(ErrorKind::Other, err)); + } + let bytes_read = match self.remaining.as_mut() { + None => { + // We expect a zero-read to indicate the end of stream, which is the default way of streams to end. + // In our case though, it requires sending an extra zero-write, so we avoid that usually. + let input = self.fill_buf()?; + let nb = input.len().min(buf.len()); + buf[..nb].copy_from_slice(&input[..nb]); + self.parent.pos += nb; + nb + } + Some(remaining) => { + let bytes_read = self.parent.read.read(&mut buf[..buf_len.min(*remaining)])?; + *remaining -= bytes_read; + bytes_read + } + }; + if bytes_read == 0 { + self.remaining = Some(0); + } + Ok(bytes_read) + } +} diff --git a/gix-archive/src/stream/mod.rs b/gix-archive/src/stream/mod.rs new file mode 100644 index 00000000000..20383e5a4a0 --- /dev/null +++ b/gix-archive/src/stream/mod.rs @@ -0,0 +1,192 @@ +use crate::Stream; +use gix_object::bstr::BString; +use std::path::Path; +use std::sync::Arc; + +pub(crate) type SharedErrorSlot = Arc>>; + +/// The error returned by [`next_entry()`][Stream::next_entry()]. +#[derive(Debug, thiserror::Error)] +#[allow(missing_docs)] +pub enum Error { + #[error(transparent)] + Io(#[from] std::io::Error), + #[error("Could not find a blob or tree for archival")] + Find(#[source] Box), + #[error("Could not query attributes for path \"{path}\"")] + Attributes { + path: BString, + source: Box, + }, + #[error(transparent)] + Traverse(#[from] gix_traverse::tree::breadthfirst::Error), + #[error(transparent)] + ConvertToWorktree(#[from] gix_filter::pipeline::convert::to_worktree::Error), +} + +/// An entry in a stream. Note that they must be consumed fully, by reading from them till exhaustion. +/// +/// ### Drop behaviour +/// +/// If the entry is dropped without reading it till exhaustion, the stream is tainted and +/// [`next_entry()`][Stream::next_entry()] will panic next time it is called. +pub struct Entry<'a> { + /// The kind of entry at [`relative_path`][Self::relative_path()]. + pub mode: gix_object::tree::EntryMode, + /// The hash of the object, uniquely identifying it. + pub id: gix_hash::ObjectId, + /// Access to our parent + parent: &'a mut Stream, + /// The path relative to the repository at which data should be written. + path_buf: Option, + /// The amount of bytes left to read if the size of bytes to read is known. + /// It's also our marker to say that we are depleted, which is important to signal to the + /// parent stream that we can proceed reading the next entry. 
+ remaining: Option, +} + +/// An entry that is added to the stream by the user, verbatim, without additional worktree conversions. +/// +/// It may overwrite previously written paths, which may or may not work for the consumer of the stream. +pub struct AdditionalEntry { + /// The hash of the object, uniquely identifying it. + /// Note that it can be [`null()`][gix_hash::ObjectId::null()] as the hash is typically ignored by consumers of the stream. + pub id: gix_hash::ObjectId, + /// The kind of entry to create. + pub mode: gix_object::tree::EntryMode, + /// The path relative to the repository at which content should be located. + pub relative_path: BString, + /// Where to get the content of the entry from. + pub source: entry::Source, +} + +/// Lifecycle +impl Stream { + /// Turn ourselves into the underlying byte stream which is a representation of the underlying git tree. + /// + /// Note that the format is unspecified, and its sole use is for transport, not for persistence. + /// Can be used with [`Self::from_read()`] to decode the contained entries. + pub fn into_read(self) -> impl std::io::Read { + self.read + } + + /// Create a new instance from a stream of bytes in our format. + /// + /// It must have been created from [`Self::into_read()`] to be compatible, and must + /// not have been persisted. + pub fn from_read(read: impl std::io::Read + 'static) -> Self { + Self { + read: utils::Read::Unknown(Box::new(read)), + extra_entries: None, + path_buf: Some(Vec::with_capacity(1024).into()), + err: Default::default(), + buf: std::iter::repeat(0).take(u16::MAX as usize).collect(), + pos: 0, + filled: 0, + } + } +} + +/// Entries +impl Stream { + /// Add `entry` to the list of entries to be returned in calls to [`Self::next_entry()`]. + /// + /// The entry will be returned after the one contained in the tree, in order of addition. + /// # Panics + /// If called after the first call to [`Self::next_entry()`]. + pub fn add_entry(&mut self, entry: AdditionalEntry) -> &mut Self { + self.extra_entries + .as_ref() + .expect("BUG: must not add entries after the start of entries traversal") + .send(entry) + .expect("Failure is impossible as thread blocks on the receiving end"); + self + } + + /// Add the item at `path` as entry to this stream, which is expected to be under `root`. + /// + /// Note that the created entries will always have a null SHA1, and that we access this path + /// to determine its type, and will access it again when it is requested. 
+ pub fn add_entry_from_path(&mut self, root: &Path, path: &Path) -> std::io::Result<&mut Self> { + let rela_path = path + .strip_prefix(root) + .map_err(|err| std::io::Error::new(std::io::ErrorKind::Other, err))?; + let meta = path.symlink_metadata()?; + let relative_path = gix_path::into_bstr(rela_path).into_owned(); + let id = gix_hash::ObjectId::null(gix_hash::Kind::Sha1); + + let entry = if meta.is_symlink() { + let content = std::fs::read_link(path)?; + let content = gix_path::into_bstr(content).into_owned(); + AdditionalEntry { + id, + mode: gix_object::tree::EntryMode::Link, + relative_path, + source: entry::Source::Memory(content.into()), + } + } else if meta.is_dir() { + AdditionalEntry { + id, + mode: gix_object::tree::EntryMode::Tree, + relative_path, + source: entry::Source::Null, + } + } else { + let mode = if gix_fs::is_executable(&meta) { + gix_object::tree::EntryMode::BlobExecutable + } else { + gix_object::tree::EntryMode::Blob + }; + AdditionalEntry { + id, + mode, + relative_path, + source: entry::Source::Path(path.to_owned()), + } + }; + Ok(self.add_entry(entry)) + } +} + +impl Stream { + pub(crate) fn new() -> ( + Stream, + gix_features::io::pipe::Writer, + std::sync::mpsc::Receiver, + ) { + let in_flight_writes = 3; // 2 = 1 write for entry header, 1 for hash, 1 for entry path + let (write, read) = gix_features::io::pipe::unidirectional(in_flight_writes); + let (tx_entries, rx_entries) = std::sync::mpsc::channel(); + ( + Stream { + read: utils::Read::Known(read), + extra_entries: Some(tx_entries), + path_buf: Some(Vec::with_capacity(1024).into()), + err: Default::default(), + buf: std::iter::repeat(0).take(u16::MAX as usize).collect(), + pos: 0, + filled: 0, + }, + write, + rx_entries, + ) + } +} + +pub(crate) mod entry; +pub(crate) mod protocol; +pub(crate) mod utils { + pub enum Read { + Known(gix_features::io::pipe::Reader), + Unknown(Box), + } + + impl std::io::Read for Read { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + match self { + Read::Known(r) => r.read(buf), + Read::Unknown(r) => r.read(buf), + } + } + } +} diff --git a/gix-archive/src/stream/protocol.rs b/gix-archive/src/stream/protocol.rs new file mode 100644 index 00000000000..f2978f8044f --- /dev/null +++ b/gix-archive/src/stream/protocol.rs @@ -0,0 +1,175 @@ +use crate::stream::{Entry, Error}; +use crate::{stream, Stream}; +use gix_object::bstr::{BStr, BString}; +use std::io::{ErrorKind, Read, Write}; + +impl Stream { + /// Access the next entry of the stream or `None` if there is nothing more to read. + pub fn next_entry(&mut self) -> Result>, Error> { + assert!( + self.path_buf.is_some(), + "BUG: must consume and drop entry before getting the next one" + ); + self.extra_entries.take(); + let res = read_entry_info( + &mut self.read, + self.path_buf.as_mut().expect("set while producing an entry"), + ); + match res { + Ok((remaining, mode, id)) => { + if let Some(err) = self.err.lock().take() { + return Err(err); + } + Ok(Some(Entry { + path_buf: self.path_buf.take(), + parent: self, + id, + mode, + remaining, + })) + } + Err(err) => { + if let Some(err) = self.err.lock().take() { + return Err(err); + } + // unexpected EOF means the other side dropped. We handled potential errors already. 
+ if err.kind() == ErrorKind::UnexpectedEof { + return Ok(None); + } + Err(err.into()) + } + } + } +} + +// Format: [usize-LE][usize-LE][byte][byte][hash][relative_path_bytes][object_stream] +// Note that stream_len can be usize::MAX to indicate the stream size is unknown +fn read_entry_info( + read: &mut stream::utils::Read, + path_buf: &mut BString, +) -> std::io::Result<(Option, gix_object::tree::EntryMode, gix_hash::ObjectId)> { + let mut buf = [0; std::mem::size_of::() * 2 + 2]; + + read.read_exact(&mut buf)?; + let (path_len, rest) = buf.split_at(std::mem::size_of::()); + let (stream_len, bytes) = rest.split_at(std::mem::size_of::()); + let path_len = usize::from_le_bytes(path_len.try_into().expect("valid")); + let stream_size = usize::from_le_bytes(stream_len.try_into().expect("valid")); + let mode = byte_to_mode(bytes[0]); + let hash_kind = byte_to_hash(bytes[1]); + + let mut hash = hash_kind.null(); + read.read_exact(hash.as_mut_slice())?; + + clear_and_set_len(path_buf, path_len); + read.read_exact(path_buf)?; + + Ok(((stream_size != usize::MAX).then_some(stream_size), mode, hash)) +} + +/// This function must match the read-count of `read_entry_info` for max efficiency. +pub(crate) fn write_entry_header_and_path( + path: &BStr, + oid: &gix_hash::oid, + mode: gix_object::tree::EntryMode, + stream_len: Option, + out: &mut gix_features::io::pipe::Writer, +) -> std::io::Result<()> { + let mut buf = [0u8; std::mem::size_of::() * 2 + 2]; + let (path_len_buf, rest) = buf.split_at_mut(std::mem::size_of::()); + let (stream_len_buf, bytes) = rest.split_at_mut(std::mem::size_of::()); + + path_len_buf.copy_from_slice(&path.len().to_le_bytes()); + stream_len_buf.copy_from_slice(&stream_len.unwrap_or(usize::MAX).to_le_bytes()); + bytes[0] = mode_to_byte(mode); + bytes[1] = hash_to_byte(oid.kind()); + + // We know how `out` works in a pipe writer, it's always writing everything. + #[allow(clippy::unused_io_amount)] + { + out.write(&buf)?; + out.write(oid.as_bytes())?; + out.write(path)?; + } + Ok(()) +} + +/// This writes everything in `input` in such way that the receiver knows exactly how much to read. +/// The format is similar to the packetline format, but in binary. +pub(crate) fn write_stream( + buf: &mut Vec, + mut input: impl std::io::Read, + out: &mut gix_features::io::pipe::Writer, +) -> std::io::Result<()> { + const BUF_LEN: usize = u16::MAX as usize; + clear_and_set_len(buf, BUF_LEN); + + // We know how `out` works in a pipe writer, it's always writing everything. 
+ #[allow(clippy::unused_io_amount)] + loop { + match input.read(buf) { + Ok(0) => { + // terminator + out.write(&0_u16.to_le_bytes())?; + break; + } + Ok(n) => { + out.write(&(n as u16).to_le_bytes())?; + out.write(&buf[..n])?; + } + Err(ref e) if e.kind() == ErrorKind::Interrupted => {} + Err(e) => return Err(e), + } + } + + Ok(()) +} + +fn byte_to_hash(b: u8) -> gix_hash::Kind { + match b { + 0 => gix_hash::Kind::Sha1, + _ => unreachable!("BUG: we control the protocol"), + } +} + +fn byte_to_mode(b: u8) -> gix_object::tree::EntryMode { + use gix_object::tree::EntryMode::*; + match b { + 0 => Tree, + 1 => Blob, + 2 => BlobExecutable, + 3 => Link, + 4 => Commit, + _ => unreachable!("BUG: we control the protocol"), + } +} + +fn hash_to_byte(h: gix_hash::Kind) -> u8 { + match h { + gix_hash::Kind::Sha1 => 0, + } +} + +fn mode_to_byte(m: gix_object::tree::EntryMode) -> u8 { + use gix_object::tree::EntryMode::*; + match m { + Tree => 0, + Blob => 1, + BlobExecutable => 2, + Link => 3, + Commit => 4, + } +} + +fn clear_and_set_len(buf: &mut Vec, len: usize) { + buf.clear(); + if buf.capacity() < len { + buf.reserve(len); + assert!(buf.capacity() >= len, "{} >= {}", buf.capacity(), len); + } + // SAFETY: we just assured that `buf` has the right capacity to hold `cap` + #[allow(unsafe_code)] + unsafe { + buf.set_len(len); + } +} diff --git a/gix-archive/src/write.rs b/gix-archive/src/write.rs deleted file mode 100644 index 40c664d91ce..00000000000 --- a/gix-archive/src/write.rs +++ /dev/null @@ -1,37 +0,0 @@ -use crate::Stream; -use gix_object::bstr::BStr; - -/// Use `find` to traverse `tree` and fetch the contained blobs to return as [`Stream`], which makes them queryable -/// on demand with support for streaming each entry. -/// `pipeline` is used to convert blobs to their worktree representation, and `attributes` is used to read -/// the `export-ignore` attribute. If set on a directory or blob, it won't be added to the archive. -/// -/// ### Progress and interruptions -/// -/// For per-file progress, integrate progress handling into `find` as it is called for trees and blobs. -/// `find` should also be used for interrupt handling, as it can return an error once per file. -/// For progress on bytes-written, integrate progress reporting when consuming the stream. -/// Further it's possible to drop the returned [`Stream`] to halt all operation. -/// -/// ### Threaded Operation -/// -/// This function spawns a thread that will access the tree data in the background, synchronized through -/// `Stream` so that it will not be faster than the consumer, with at most one file in flight at any time. -/// -/// ### Limitations -/// -/// * `export-subst` is not support, as it requires the entire formatting engine of `git log`. 
-pub fn write_to( - _tree: gix_hash::ObjectId, - mut _find: Find, - _pipeline: &mut gix_filter::Pipeline, - _attributes: impl FnMut(&BStr, &mut gix_attributes::search::Outcome) -> Result<(), E2> + Send, -) -> Stream -where - W: std::io::Write, - Find: for<'a> FnMut(&gix_hash::oid, &'a mut Vec) -> Result, E1> + Send, - E1: std::error::Error + Send + Sync + 'static, - E2: std::error::Error + Send + Sync + 'static, -{ - todo!() -} diff --git a/gix-archive/src/write/mod.rs b/gix-archive/src/write/mod.rs new file mode 100644 index 00000000000..9c3c2f4c67a --- /dev/null +++ b/gix-archive/src/write/mod.rs @@ -0,0 +1,151 @@ +use crate::stream::{Error, SharedErrorSlot}; +use crate::{stream, Stream}; +use gix_object::bstr::BStr; +use std::io::Write; + +/// Use `find` to traverse `tree` and fetch the contained blobs to return as [`Stream`], which makes them queryable +/// on demand with support for streaming each entry. +/// `pipeline` is used to convert blobs to their worktree representation, and `attributes` is used to read +/// the `export-ignore` attribute. If set on a directory or blob, it won't be added to the archive. +/// +/// ### Types of entries in stream +/// +/// We only return blobs (with or without executable), which may be symlinks in which case their content will +/// be target of the symlink. +/// Directories are never returned, but maybe added by the caller via [Stream::add_entry()]. +/// +/// ### Progress and interruptions +/// +/// For per-file progress, integrate progress handling into the calls of [`Stream::next_entry()`] as that +/// correlates blobs. +/// Additional interrupt handling can be wrapped around the `Read` implementation of each [`stream::Entry`]. +/// For progress on bytes-written, integrate progress reporting when consuming the stream. +/// Further it's possible to drop the returned [`Stream`] to halt all operation. +/// +/// ### Threaded Operation +/// +/// This function spawns a thread that will access the tree data in the background, synchronized through +/// `Stream` so that it will not be faster than the consumer, with at most one file in flight at any time. +/// +/// ### Limitations +/// +/// * `export-subst` is not support, as it requires the entire formatting engine of `git log`. 
+pub fn write_to( + tree: gix_hash::ObjectId, + find: Find, + pipeline: gix_filter::Pipeline, + attributes: impl FnMut(&BStr, gix_object::tree::EntryMode, &mut gix_attributes::search::Outcome) -> Result<(), E2> + + Send + + 'static, +) -> Stream +where + Find: for<'a> FnMut(&gix_hash::oid, &'a mut Vec) -> Result, E1> + Clone + Send + 'static, + E1: std::error::Error + Send + Sync + 'static, + E2: std::error::Error + Send + Sync + 'static, +{ + let (stream, mut write, additional_entries) = Stream::new(); + std::thread::spawn({ + let slot = stream.err.clone(); + move || { + if let Err(err) = run( + tree, + find, + pipeline, + attributes, + &mut write, + slot.clone(), + additional_entries, + ) { + { + let mut slot = slot.lock(); + if slot.is_none() { + *slot = Some(err); + } else { + drop(slot); + write + .channel + .send(Err(std::io::Error::new(std::io::ErrorKind::Other, err))) + .ok(); + } + } + } + } + }); + stream +} + +fn run( + tree: gix_hash::ObjectId, + mut find: Find, + pipeline: gix_filter::Pipeline, + mut attributes: impl FnMut(&BStr, gix_object::tree::EntryMode, &mut gix_attributes::search::Outcome) -> Result<(), E2> + + Send + + 'static, + out: &mut gix_features::io::pipe::Writer, + err: SharedErrorSlot, + additional_entries: std::sync::mpsc::Receiver, +) -> Result<(), Error> +where + Find: for<'a> FnMut(&gix_hash::oid, &'a mut Vec) -> Result, E1> + Clone + Send + 'static, + E1: std::error::Error + Send + Sync + 'static, + E2: std::error::Error + Send + Sync + 'static, +{ + let mut buf = Vec::new(); + let obj = find(tree.as_ref(), &mut buf).map_err(|err| Error::Find(Box::new(err)))?; + let tree = gix_object::TreeRefIter::from_bytes(obj.data); + + let mut attrs = gix_attributes::search::Outcome::default(); + attrs.initialize_with_selection(&Default::default(), Some("export-ignore")); + let mut dlg = traverse::Delegate { + out, + err, + pipeline, + attrs, + find: { + let mut find = find.clone(); + move |a: &gix_hash::oid, b: &mut Vec| find(a, b).map_err(|err| stream::Error::Find(Box::new(err))) + }, + fetch_attributes: move |a: &BStr, b: gix_object::tree::EntryMode, c: &mut gix_attributes::search::Outcome| { + attributes(a, b, c).map_err(|err| stream::Error::Attributes { + source: Box::new(err), + path: a.to_owned(), + }) + }, + path_deque: Default::default(), + path: Default::default(), + buf: Vec::with_capacity(1024), + }; + gix_traverse::tree::breadthfirst( + tree, + gix_traverse::tree::breadthfirst::State::default(), + |id, buf| { + find(id, buf) + .map(|obj| gix_object::TreeRefIter::from_bytes(obj.data)) + .ok() + }, + &mut dlg, + )?; + + for entry in additional_entries { + stream::protocol::write_entry_header_and_path( + entry.relative_path.as_ref(), + &entry.id, + entry.mode, + entry.source.len(), + out, + )?; + // pipe writer always writes all in one go. + #[allow(clippy::unused_io_amount)] + match entry.source { + stream::entry::Source::Memory(buf) => out.write(&buf).map(|_| ()), + stream::entry::Source::Null => out.write(&[]).map(|_| ()), + stream::entry::Source::Path(path) => { + let file = std::fs::File::open(path)?; + stream::protocol::write_stream(&mut buf, file, out) + } + }? 
+ } + Ok(()) +} + +mod traverse; diff --git a/gix-archive/src/write/traverse.rs b/gix-archive/src/write/traverse.rs new file mode 100644 index 00000000000..78e555509b8 --- /dev/null +++ b/gix-archive/src/write/traverse.rs @@ -0,0 +1,153 @@ +use crate::stream; +use crate::stream::SharedErrorSlot; +use gix_filter::driver::apply::MaybeDelayed; +use gix_filter::pipeline::convert::ToWorktreeOutcome; +use gix_object::bstr::{BStr, BString, ByteSlice, ByteVec}; +use gix_object::tree; +use gix_traverse::tree::visit::Action; +use gix_traverse::tree::Visit; +use std::collections::VecDeque; +use std::io::Write; + +pub struct Delegate<'a, AttributesFn, FindFn> +where + FindFn: for<'b> FnMut(&gix_hash::oid, &'b mut Vec) -> Result, stream::Error> + 'static, +{ + pub(crate) out: &'a mut gix_features::io::pipe::Writer, + pub(crate) err: SharedErrorSlot, + pub(crate) path_deque: VecDeque, + pub(crate) path: BString, + pub(crate) pipeline: gix_filter::Pipeline, + pub(crate) attrs: gix_attributes::search::Outcome, + pub(crate) fetch_attributes: AttributesFn, + pub(crate) find: FindFn, + pub(crate) buf: Vec, +} + +impl Delegate<'_, AttributesFn, FindFn> +where + FindFn: for<'b> FnMut(&gix_hash::oid, &'b mut Vec) -> Result, stream::Error> + 'static, + AttributesFn: FnMut(&BStr, gix_object::tree::EntryMode, &mut gix_attributes::search::Outcome) -> Result<(), stream::Error> + + 'static, +{ + fn pop_element(&mut self) { + if let Some(pos) = self.path.rfind_byte(b'/') { + self.path.resize(pos, 0); + } else { + self.path.clear(); + } + } + + fn push_element(&mut self, name: &BStr) { + if !self.path.is_empty() { + self.path.push(b'/'); + } + self.path.push_str(name); + } + /// Return the state of the `export-ignore` attribute. + fn ignore_state(&self) -> gix_attributes::StateRef<'_> { + self.attrs + .iter_selected() + .next() + .expect("initialized with one attr") + .assignment + .state + } + + fn handle_entry(&mut self, entry: &tree::EntryRef<'_>) -> Result { + if !entry.mode.is_blob_or_symlink() { + return Ok(Action::Continue); + } + (self.fetch_attributes)(self.path.as_ref(), entry.mode, &mut self.attrs)?; + if self.ignore_state().is_set() { + return Ok(Action::Continue); + } + (self.find)(entry.oid, &mut self.buf)?; + + let converted = self.pipeline.convert_to_worktree( + &self.buf, + self.path.as_ref(), + |a, b| { + (self.fetch_attributes)(a, entry.mode, b).ok(); + }, + gix_filter::driver::apply::Delay::Forbid, + )?; + + // Our pipe writer always writes the whole amount. 
+ #[allow(clippy::unused_io_amount)] + match converted { + ToWorktreeOutcome::Unchanged(buf) | ToWorktreeOutcome::Buffer(buf) => { + stream::protocol::write_entry_header_and_path( + self.path.as_ref(), + entry.oid, + entry.mode, + Some(buf.len()), + self.out, + )?; + self.out.write(buf)?; + } + ToWorktreeOutcome::Process(MaybeDelayed::Immediate(read)) => { + stream::protocol::write_entry_header_and_path( + self.path.as_ref(), + entry.oid, + entry.mode, + None, + self.out, + )?; + stream::protocol::write_stream(&mut self.buf, read, self.out)?; + } + ToWorktreeOutcome::Process(MaybeDelayed::Delayed(_)) => { + unreachable!("we forbade it") + } + } + Ok(Action::Continue) + } +} + +impl Visit for Delegate<'_, AttributesFn, FindFn> +where + FindFn: for<'a> FnMut(&gix_hash::oid, &'a mut Vec) -> Result, stream::Error> + 'static, + AttributesFn: FnMut(&BStr, gix_object::tree::EntryMode, &mut gix_attributes::search::Outcome) -> Result<(), stream::Error> + + 'static, +{ + fn pop_front_tracked_path_and_set_current(&mut self) { + self.path = self + .path_deque + .pop_front() + .expect("every call is matched with push_tracked_path_component"); + } + + fn push_back_tracked_path_component(&mut self, component: &BStr) { + self.push_element(component); + self.path_deque.push_back(self.path.clone()); + } + + fn push_path_component(&mut self, component: &BStr) { + self.push_element(component); + } + + fn pop_path_component(&mut self) { + self.pop_element() + } + + fn visit_tree(&mut self, entry: &tree::EntryRef<'_>) -> Action { + if let Err(err) = (self.fetch_attributes)(self.path.as_ref(), entry.mode, &mut self.attrs) { + *self.err.lock() = Some(err); + Action::Cancel + } else if self.ignore_state().is_set() { + Action::Skip + } else { + Action::Continue + } + } + + fn visit_nontree(&mut self, entry: &tree::EntryRef<'_>) -> Action { + match self.handle_entry(entry) { + Ok(action) => action, + Err(err) => { + *self.err.lock() = Some(err); + Action::Cancel + } + } + } +} diff --git a/gix-archive/tests/archive.rs b/gix-archive/tests/archive.rs index 8b137891791..cfe83342d7d 100644 --- a/gix-archive/tests/archive.rs +++ b/gix-archive/tests/archive.rs @@ -1 +1,295 @@ +/// Convert a hexadecimal hash into its corresponding `ObjectId` or _panic_. 
+fn hex_to_id(hex: &str) -> gix_hash::ObjectId { + gix_hash::ObjectId::from_hex(hex.as_bytes()).expect("40 bytes hex") +} +mod write_to { + use crate::hex_to_id; + use gix_archive::write_to; + use gix_attributes::glob::pattern::Case; + use gix_object::bstr::ByteSlice; + use gix_object::tree::EntryMode; + use gix_odb::FindExt; + use gix_testtools::once_cell::sync::Lazy; + use gix_worktree::cache::state::attributes::Source; + use std::convert::Infallible; + use std::io::{Error, ErrorKind, Read, Write}; + use std::path::PathBuf; + use std::sync::Arc; + + #[test] + fn can_receive_err_if_root_is_not_found() { + let mut stream = write_to( + gix_hash::Kind::Sha1.null(), + |_, _| Err(Error::new(ErrorKind::Other, "object retrieval failed")), + mutating_pipeline(false), + |_, _, _| -> Result<_, Infallible> { unreachable!("must not be called") }, + ); + let err = stream.next_entry().unwrap_err(); + assert_eq!(err.to_string(), "Could not find a blob or tree for archival"); + } + + #[test] + fn can_receive_err_if_attribute_not_found() -> gix_testtools::Result { + let (_dir, head_tree, odb, _cache) = basic()?; + let mut stream = write_to( + head_tree, + move |id, buf| odb.find(id, buf), + mutating_pipeline(false), + |_, _, _| Err(Error::new(ErrorKind::Other, "attribute retrieval failed")), + ); + let err = stream.next_entry().unwrap_err(); + assert_eq!( + err.to_string(), + "Could not query attributes for path \".gitattributes\"" + ); + Ok(()) + } + + #[test] + fn will_provide_all_information_and_respect_export_ignore() -> gix_testtools::Result { + let (dir, head_tree, odb, mut cache) = basic()?; + let mut stream = write_to( + head_tree, + { + let odb = odb.clone(); + move |id, buf| odb.find(id, buf) + }, + mutating_pipeline(true), + move |rela_path, mode, attrs| { + cache + .at_entry(rela_path, mode.is_tree().into(), |id, buf| odb.find_blob(id, buf)) + .map(|entry| entry.matching_attributes(attrs)) + .map(|_| ()) + }, + ); + stream + .add_entry_from_path(&dir, &dir.join("extra-file"))? + .add_entry_from_path(&dir, &dir.join("extra-exe"))? + .add_entry_from_path(&dir, &dir.join("extra-dir-empty"))? 
+ .add_entry_from_path(&dir, &dir.join("extra-dir").join("symlink-to-extra"))?; + + let tee_read = TeeToMemory { + read: stream.into_read(), + write: Default::default(), + }; + let copy = tee_read.write.clone(); + let mut paths_and_modes = Vec::new(); + let mut stream = gix_archive::Stream::from_read(tee_read); + + while let Some(mut entry) = stream.next_entry().expect("entry retrieval does not fail") { + paths_and_modes.push((entry.relative_path().to_owned(), entry.mode, entry.id)); + let mut buf = Vec::new(); + entry.read_to_end(&mut buf).expect("stream can always be read"); + if !buf.is_empty() && entry.mode.is_blob() { + if entry.relative_path().contains_str("extra") { + assert!( + buf.find_byte(b'\r').is_none(), + "extra-files are not processed in any way" + ); + } else { + assert!( + buf.find_byte(b'\r').is_some(), + "'{}' did not contain a carriage return as sign of having been filtered", + buf.as_bstr() + ); + if entry.relative_path().ends_with_str(b"streamed") { + assert_eq!(buf.as_bstr(), "➡streamed-by-driver\r\n"); + } + } + } + } + + let expected_extra_exe_mode = if cfg!(windows) { + EntryMode::Blob + } else { + EntryMode::BlobExecutable + }; + assert_eq!( + paths_and_modes, + &[ + ( + ".gitattributes".into(), + EntryMode::Blob, + hex_to_id("45c160c35c17ad264b96431cceb9793160396e99") + ), + ( + "a".into(), + EntryMode::Blob, + hex_to_id("45b983be36b73c0788dc9cbcb76cbb80fc7bb057") + ), + ( + "symlink-to-a".into(), + EntryMode::Link, + hex_to_id("2e65efe2a145dda7ee51d1741299f848e5bf752e") + ), + ( + "dir/.gitattributes".into(), + EntryMode::Blob, + hex_to_id("81b9a375276405703e05be6cecf0fc1c8b8eed64") + ), + ( + "dir/b".into(), + EntryMode::Blob, + hex_to_id("ab4a98190cf776b43cb0fe57cef231fb93fd07e6") + ), + ( + "dir/subdir/exe".into(), + EntryMode::BlobExecutable, + hex_to_id("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391") + ), + ( + "dir/subdir/streamed".into(), + EntryMode::Blob, + hex_to_id("08991f58f4de5d85b61c0f87f3ac053c79d0e739") + ), + ( + "extra-file".into(), + EntryMode::Blob, + hex_to_id("0000000000000000000000000000000000000000") + ), + ( + "extra-exe".into(), + expected_extra_exe_mode, + hex_to_id("0000000000000000000000000000000000000000") + ), + ( + "extra-dir-empty".into(), + EntryMode::Tree, + hex_to_id("0000000000000000000000000000000000000000") + ), + ( + "extra-dir/symlink-to-extra".into(), + EntryMode::Link, + hex_to_id("0000000000000000000000000000000000000000") + ) + ] + ); + assert_eq!( + copy.lock().len(), + 710, + "keep track of file size changes of the streaming format" + ); + + let mut copied_stream = gix_archive::Stream::from_read(std::io::Cursor::new(copy.lock().as_bytes().to_owned())); + let mut copied_paths_and_modes = Vec::new(); + let mut buf = Vec::new(); + while let Some(mut entry) = copied_stream.next_entry().expect("entry retrieval does not fail") { + copied_paths_and_modes.push((entry.relative_path().to_owned(), entry.mode, entry.id)); + buf.clear(); + entry.read_to_end(&mut buf).expect("stream can always be read"); + } + assert_eq!( + copied_paths_and_modes, paths_and_modes, + "a stream copy yields exactly the same result" + ); + Ok(()) + } + + #[test] + fn can_drop_entry_without_reading_it() -> gix_testtools::Result { + let (_dir, head_tree, odb, mut cache) = basic()?; + let mut stream = write_to( + head_tree, + { + let odb = odb.clone(); + move |id, buf| odb.find(id, buf) + }, + mutating_pipeline(false), + move |rela_path, mode, attrs| { + cache + .at_entry(rela_path, mode.is_tree().into(), |id, buf| odb.find_blob(id, buf)) + .map(|entry| 
entry.matching_attributes(attrs)) + .map(|_| ()) + }, + ); + + drop(stream.next_entry().expect("entry retrieval does not fail")); + Ok(()) + } + + fn basic() -> gix_testtools::Result<(PathBuf, gix_hash::ObjectId, gix_odb::HandleArc, gix_worktree::Cache)> { + let dir = gix_testtools::scripted_fixture_read_only("basic.sh")?; + + let head = { + let hex = std::fs::read(dir.join("head.hex"))?; + gix_hash::ObjectId::from_hex(hex.trim())? + }; + let odb = gix_odb::at(dir.join(".git").join("objects"))?; + + let mut collection = Default::default(); + let mut buf = Default::default(); + let attributes = gix_worktree::cache::state::Attributes::new( + gix_attributes::Search::new_globals(None::, &mut buf, &mut collection)?, + None, + Source::WorktreeThenIdMapping, + collection, + ); + let state = gix_worktree::cache::State::AttributesStack(attributes); + let cache = gix_worktree::Cache::new(&dir, state, Case::Sensitive, Default::default(), Default::default()); + Ok((dir, head, odb.into_arc()?, cache)) + } + + fn mutating_pipeline(driver: bool) -> gix_filter::Pipeline { + gix_filter::Pipeline::new( + &Default::default(), + gix_filter::pipeline::Options { + drivers: if driver { vec![driver_with_process()] } else { vec![] }, + eol_config: gix_filter::eol::Configuration { + auto_crlf: gix_filter::eol::AutoCrlf::Enabled, + ..Default::default() + }, + ..Default::default() + }, + ) + } + + pub(crate) fn driver_with_process() -> gix_filter::Driver { + let mut exe = DRIVER.to_string_lossy().into_owned(); + if cfg!(windows) { + exe = exe.replace('\\', "/"); + } + gix_filter::Driver { + name: "arrow".into(), + clean: None, + smudge: None, + process: Some((exe + " process").into()), + required: true, + } + } + + static DRIVER: Lazy = Lazy::new(|| { + let mut cargo = std::process::Command::new(env!("CARGO")); + let res = cargo + .args(["build", "-p=gix-filter", "--example", "arrow"]) + .status() + .expect("cargo should run fine"); + assert!(res.success(), "cargo invocation should be successful"); + + let path = PathBuf::from(env!("CARGO_TARGET_TMPDIR")) + .ancestors() + .nth(1) + .expect("first parent in target dir") + .join("debug") + .join("examples") + .join(if cfg!(windows) { "arrow.exe" } else { "arrow" }); + assert!(path.is_file(), "Expecting driver to be located at {path:?}"); + path + }); + + struct TeeToMemory { + read: R, + write: Arc>>, + } + + impl std::io::Read for TeeToMemory + where + R: std::io::Read, + { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + let nb = self.read.read(buf)?; + self.write.lock().write_all(&buf[..nb])?; + Ok(nb) + } + } +} diff --git a/gix-archive/tests/fixtures/basic.sh b/gix-archive/tests/fixtures/basic.sh new file mode 100644 index 00000000000..9aad15f089d --- /dev/null +++ b/gix-archive/tests/fixtures/basic.sh @@ -0,0 +1,31 @@ +set -eu -o pipefail + +git init + +mkdir dir-ignored +touch dir-ignored/file-ignored-transitively +touch file-ignored + +echo "hi" > a +mkdir dir +echo "ho" > dir/b +mkdir dir/subdir +echo "subdir/streamed filter=arrow" > dir/.gitattributes +echo "streamed-by-driver" > dir/subdir/streamed +touch dir/subdir/exe +chmod +x dir/subdir/exe +ln -s a symlink-to-a + +echo "/dir-ignored/ export-ignore" > .gitattributes +echo "/file-ignored export-ignore" >> .gitattributes + +git add . 
+git commit -m "init" + +echo "extra" > extra-file +touch extra-exe && chmod +x extra-exe +mkdir extra-dir-empty extra-dir +ln -s ../extra-file extra-dir/symlink-to-extra + +git rev-parse @^{tree} > head.hex + diff --git a/gix-archive/tests/fixtures/generated-archives/basic.tar.xz b/gix-archive/tests/fixtures/generated-archives/basic.tar.xz new file mode 100644 index 00000000000..91d8d229b40 --- /dev/null +++ b/gix-archive/tests/fixtures/generated-archives/basic.tar.xz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939077297db2dbe9917633082205c063bcd35e890b2877d530f2e0f4a75d7e3f +size 11544 From 9a157ae0a4f649dacd911ccfb50facd942b992f0 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 20 Jul 2023 14:45:16 +0200 Subject: [PATCH 08/18] Create the new `gix-worktree-stream` crate from what was `gix-archive`. Also, restore `gix-archive` to the previous state. --- Cargo.lock | 32 +- Cargo.toml | 1 + README.md | 1 + crate-status.md | 10 + etc/check-package-size.sh | 1 + gix-archive/Cargo.toml | 16 +- gix-archive/src/lib.rs | 82 +++-- gix-archive/src/write.rs | 7 + gix-archive/tests/archive.rs | 294 ----------------- .../fixtures/generated-archives/basic.tar.xz | 3 - gix-worktree-stream/Cargo.toml | 30 ++ gix-worktree-stream/LICENSE-APACHE | 1 + gix-worktree-stream/LICENSE-MIT | 1 + .../src}/entry.rs | 73 ++++- .../src/from_tree}/mod.rs | 31 +- .../src/from_tree}/traverse.rs | 33 +- .../mod.rs => gix-worktree-stream/src/lib.rs | 69 ++-- .../src}/protocol.rs | 53 +-- gix-worktree-stream/tests/fixtures/basic.sh | 34 ++ .../fixtures/generated-archives/basic.tar.xz | 3 + gix-worktree-stream/tests/stream.rs | 306 ++++++++++++++++++ 21 files changed, 624 insertions(+), 457 deletions(-) create mode 100644 gix-archive/src/write.rs delete mode 100644 gix-archive/tests/fixtures/generated-archives/basic.tar.xz create mode 100644 gix-worktree-stream/Cargo.toml create mode 120000 gix-worktree-stream/LICENSE-APACHE create mode 120000 gix-worktree-stream/LICENSE-MIT rename {gix-archive/src/stream => gix-worktree-stream/src}/entry.rs (54%) rename {gix-archive/src/write => gix-worktree-stream/src/from_tree}/mod.rs (86%) rename {gix-archive/src/write => gix-worktree-stream/src/from_tree}/traverse.rs (82%) rename gix-archive/src/stream/mod.rs => gix-worktree-stream/src/lib.rs (75%) rename {gix-archive/src/stream => gix-worktree-stream/src}/protocol.rs (72%) create mode 100644 gix-worktree-stream/tests/fixtures/basic.sh create mode 100644 gix-worktree-stream/tests/fixtures/generated-archives/basic.tar.xz create mode 100644 gix-worktree-stream/tests/stream.rs diff --git a/Cargo.lock b/Cargo.lock index 91f622e6799..e2bbc5a47d2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1398,18 +1398,7 @@ dependencies = [ name = "gix-archive" version = "0.2.0" dependencies = [ - "gix-attributes 0.15.0", - "gix-features 0.32.0", - "gix-filter", - "gix-fs 0.4.0", - "gix-hash 0.11.3", - "gix-object 0.33.0", - "gix-odb", - "gix-path 0.8.3", - "gix-testtools", - "gix-traverse 0.30.0", - "gix-worktree 0.22.0", - "parking_lot", + "gix-worktree-stream", "thiserror", ] @@ -2556,6 +2545,25 @@ dependencies = [ "walkdir", ] +[[package]] +name = "gix-worktree-stream" +version = "0.2.0" +dependencies = [ + "gix-attributes 0.15.0", + "gix-features 0.32.0", + "gix-filter", + "gix-fs 0.4.0", + "gix-hash 0.11.3", + "gix-object 0.33.0", + "gix-odb", + "gix-path 0.8.3", + "gix-testtools", + "gix-traverse 0.30.0", + "gix-worktree 0.22.0", + "parking_lot", + "thiserror", +] + [[package]] name = "gloo-timers" version = 
"0.2.6" diff --git a/Cargo.toml b/Cargo.toml index 31ae31f23fb..256e87c7671 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -265,6 +265,7 @@ members = [ "gix-tui", "gix-tix", "gix-archive", + "gix-worktree-stream", "gix-revwalk", "cargo-smart-release", diff --git a/README.md b/README.md index 6198b9bb92c..8e32b429381 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,7 @@ is usable to some extent. * [gix-worktree](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-worktree) * [gix-bitmap](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-bitmap) * [gix-negotiate](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-negotiate) + * [gix-worktree-stream](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-worktree-stream) * `gitoxide-core` * **very early** _(possibly without any documentation and many rough edges)_ * [gix-date](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-date) diff --git a/crate-status.md b/crate-status.md index d92ac2a4574..f8a4e87d968 100644 --- a/crate-status.md +++ b/crate-status.md @@ -724,6 +724,16 @@ See its [README.md](https://github.com/Byron/gitoxide/blob/main/gix-lock/README. * [ ] API documentation * [ ] Some examples +### gix-worktree-stream + +* [x] encode git-tree as stream of bytes (with large file support and actual streaming) +* [x] decode bytes into entries +* [x] add custom entries to the stream +* [x] respect `export-ignore` git attribute +* [x] apply standard worktree conversion to simulate an actual checkout +* [x] API documentation + * [ ] Some examples + ### gix-archive * [ ] `write_to()` for creating an archive with various container formats diff --git a/etc/check-package-size.sh b/etc/check-package-size.sh index e572c40f415..80839dad988 100755 --- a/etc/check-package-size.sh +++ b/etc/check-package-size.sh @@ -18,6 +18,7 @@ echo "in root: gitoxide CLI" (enter cargo-smart-release && indent cargo diet -n --package-size-limit 110KB) (enter gix-actor && indent cargo diet -n --package-size-limit 5KB) (enter gix-archive && indent cargo diet -n --package-size-limit 10KB) +(enter gix-worktree-stream && indent cargo diet -n --package-size-limit 40KB) (enter gix-utils && indent cargo diet -n --package-size-limit 10KB) (enter gix-fs && indent cargo diet -n --package-size-limit 10KB) (enter gix-pathspec && indent cargo diet -n --package-size-limit 30KB) diff --git a/gix-archive/Cargo.toml b/gix-archive/Cargo.toml index 8c25e4b036c..fed7d899c0d 100644 --- a/gix-archive/Cargo.toml +++ b/gix-archive/Cargo.toml @@ -12,19 +12,5 @@ rust-version = "1.65" doctest = false [dependencies] -gix-features = { version = "^0.32.0", path = "../gix-features", features = ["progress", "io-pipe"] } -gix-hash = { version = "^0.11.3", path = "../gix-hash" } -gix-object = { version = "^0.33.0", path = "../gix-object" } -gix-attributes = { version = "^0.15.0", path = "../gix-attributes" } -gix-filter = { version = "^0.1.0", path = "../gix-filter" } -gix-traverse = { version = "^0.30.0", path = "../gix-traverse" } -gix-fs = { version = "^0.4.0", path = "../gix-fs" } -gix-path = { version = "^0.8.3", path = "../gix-path" } - thiserror = "1.0.26" -parking_lot = "0.12.1" - -[dev-dependencies] -gix-testtools = { path = "../tests/tools"} -gix-odb = { path = "../gix-odb"} -gix-worktree = { path = "../gix-worktree"} +gix-worktree-stream = { version = "^0.2.0", path = "../gix-worktree-stream" } diff --git a/gix-archive/src/lib.rs b/gix-archive/src/lib.rs index 7fae7820eb6..0b229507f43 100644 --- a/gix-archive/src/lib.rs +++ 
b/gix-archive/src/lib.rs @@ -1,30 +1,62 @@ -//! The implementation of creating an archive from a git tree, similar to `git archive`, but using an internal format. -//! -//! This crate can effectively be used to manipulate worktrees as streams of bytes, which can be decoded using the [`Stream`] type. -#![deny(rust_2018_idioms, missing_docs, unsafe_code)] +//! The implementation of creating an archive from a git tree, similar to `git archive`. +#![deny(rust_2018_idioms, missing_docs)] +#![forbid(unsafe_code)] -use gix_object::bstr::BString; +/// The error returned by [`write_to()`]. +#[derive(Debug, thiserror::Error)] +#[allow(missing_docs)] +pub enum Error { + #[error(transparent)] + Io(#[from] std::io::Error), +} -/// -pub mod stream; +/// The supported container formats for use in [`write_to()`]. +#[derive(Default, PartialEq, Eq, Copy, Clone, Debug)] +pub enum Format { + /// An internal format that is suitable only for intra-process communication. + /// + /// It is provided here as a basis available without extra dependencies, and as a debugging tool. + #[default] + InternalTransientNonPersistable, + /// A standard `tar` archive. + /// + /// Use it as well if a custom container format is desired. The idea is to decode it on a separate thread + /// to rewrite the data to the desired format. + Tar, + /// A convenience format that will `zip` deflate the `tar` stream. + TarGz { + /// The compression level to use for the `zlib` compression, ranging from 0 (no compression) to 9 (best compression). + compression_level: u8, + }, + /// Use the zip` container format, instead of `tar`, provided for convenience. + Zip { + /// The compression level to use for the `zlib` compression, ranging from 0 (no compression) to 9 (best compression). + compression_level: u8, + }, +} -mod write; -pub use write::write_to; +/// Options for configuring [`write_to()`]. +#[derive(Clone, Debug)] +pub struct Options { + /// The archive's format. + pub format: Format, + /// Given a `path`, originating in the git tree, to place into the archive, put `/path` in front of it. + pub tree_prefix: Option, + /// The modification time for all entries in the archive. + /// + /// Defaults to the current time. The caller may set this to the commit time if available. + pub modification_time: std::time::SystemTime, +} -/// A stream of entries that originate from a git tree and optionally from additional entries. -/// -/// Note that a git tree is mandatory, but the empty tree can be used to effectively disable it. -pub struct Stream { - read: stream::utils::Read, - err: stream::SharedErrorSlot, - extra_entries: Option>, - // additional_entries: Vec, - /// `None` if currently held by an entry. - path_buf: Option, - /// Another buffer to partially act like a buf-reader. - buf: Vec, - /// The offset into `buf` for entries being able to act like a buf reader. - pos: usize, - /// The amount of bytes usable from `buf` (even though it always has a fixed size) - filled: usize, +impl Default for Options { + fn default() -> Self { + Options { + format: Default::default(), + tree_prefix: None, + modification_time: std::time::SystemTime::now(), + } + } } + +mod write; +pub use write::write_to; diff --git a/gix-archive/src/write.rs b/gix-archive/src/write.rs new file mode 100644 index 00000000000..b4dce740099 --- /dev/null +++ b/gix-archive/src/write.rs @@ -0,0 +1,7 @@ +use crate::{Error, Options}; +use gix_worktree_stream::Stream; + +/// Write the worktree `stream` to `out` configured according to `opts`. 
+pub fn write_to(_stream: &mut Stream, mut _out: impl std::io::Write, _opts: Options) -> Result<(), Error> { + Ok(()) +} diff --git a/gix-archive/tests/archive.rs b/gix-archive/tests/archive.rs index cfe83342d7d..8b137891791 100644 --- a/gix-archive/tests/archive.rs +++ b/gix-archive/tests/archive.rs @@ -1,295 +1 @@ -/// Convert a hexadecimal hash into its corresponding `ObjectId` or _panic_. -fn hex_to_id(hex: &str) -> gix_hash::ObjectId { - gix_hash::ObjectId::from_hex(hex.as_bytes()).expect("40 bytes hex") -} -mod write_to { - use crate::hex_to_id; - use gix_archive::write_to; - use gix_attributes::glob::pattern::Case; - use gix_object::bstr::ByteSlice; - use gix_object::tree::EntryMode; - use gix_odb::FindExt; - use gix_testtools::once_cell::sync::Lazy; - use gix_worktree::cache::state::attributes::Source; - use std::convert::Infallible; - use std::io::{Error, ErrorKind, Read, Write}; - use std::path::PathBuf; - use std::sync::Arc; - - #[test] - fn can_receive_err_if_root_is_not_found() { - let mut stream = write_to( - gix_hash::Kind::Sha1.null(), - |_, _| Err(Error::new(ErrorKind::Other, "object retrieval failed")), - mutating_pipeline(false), - |_, _, _| -> Result<_, Infallible> { unreachable!("must not be called") }, - ); - let err = stream.next_entry().unwrap_err(); - assert_eq!(err.to_string(), "Could not find a blob or tree for archival"); - } - - #[test] - fn can_receive_err_if_attribute_not_found() -> gix_testtools::Result { - let (_dir, head_tree, odb, _cache) = basic()?; - let mut stream = write_to( - head_tree, - move |id, buf| odb.find(id, buf), - mutating_pipeline(false), - |_, _, _| Err(Error::new(ErrorKind::Other, "attribute retrieval failed")), - ); - let err = stream.next_entry().unwrap_err(); - assert_eq!( - err.to_string(), - "Could not query attributes for path \".gitattributes\"" - ); - Ok(()) - } - - #[test] - fn will_provide_all_information_and_respect_export_ignore() -> gix_testtools::Result { - let (dir, head_tree, odb, mut cache) = basic()?; - let mut stream = write_to( - head_tree, - { - let odb = odb.clone(); - move |id, buf| odb.find(id, buf) - }, - mutating_pipeline(true), - move |rela_path, mode, attrs| { - cache - .at_entry(rela_path, mode.is_tree().into(), |id, buf| odb.find_blob(id, buf)) - .map(|entry| entry.matching_attributes(attrs)) - .map(|_| ()) - }, - ); - stream - .add_entry_from_path(&dir, &dir.join("extra-file"))? - .add_entry_from_path(&dir, &dir.join("extra-exe"))? - .add_entry_from_path(&dir, &dir.join("extra-dir-empty"))? 
- .add_entry_from_path(&dir, &dir.join("extra-dir").join("symlink-to-extra"))?; - - let tee_read = TeeToMemory { - read: stream.into_read(), - write: Default::default(), - }; - let copy = tee_read.write.clone(); - let mut paths_and_modes = Vec::new(); - let mut stream = gix_archive::Stream::from_read(tee_read); - - while let Some(mut entry) = stream.next_entry().expect("entry retrieval does not fail") { - paths_and_modes.push((entry.relative_path().to_owned(), entry.mode, entry.id)); - let mut buf = Vec::new(); - entry.read_to_end(&mut buf).expect("stream can always be read"); - if !buf.is_empty() && entry.mode.is_blob() { - if entry.relative_path().contains_str("extra") { - assert!( - buf.find_byte(b'\r').is_none(), - "extra-files are not processed in any way" - ); - } else { - assert!( - buf.find_byte(b'\r').is_some(), - "'{}' did not contain a carriage return as sign of having been filtered", - buf.as_bstr() - ); - if entry.relative_path().ends_with_str(b"streamed") { - assert_eq!(buf.as_bstr(), "➡streamed-by-driver\r\n"); - } - } - } - } - - let expected_extra_exe_mode = if cfg!(windows) { - EntryMode::Blob - } else { - EntryMode::BlobExecutable - }; - assert_eq!( - paths_and_modes, - &[ - ( - ".gitattributes".into(), - EntryMode::Blob, - hex_to_id("45c160c35c17ad264b96431cceb9793160396e99") - ), - ( - "a".into(), - EntryMode::Blob, - hex_to_id("45b983be36b73c0788dc9cbcb76cbb80fc7bb057") - ), - ( - "symlink-to-a".into(), - EntryMode::Link, - hex_to_id("2e65efe2a145dda7ee51d1741299f848e5bf752e") - ), - ( - "dir/.gitattributes".into(), - EntryMode::Blob, - hex_to_id("81b9a375276405703e05be6cecf0fc1c8b8eed64") - ), - ( - "dir/b".into(), - EntryMode::Blob, - hex_to_id("ab4a98190cf776b43cb0fe57cef231fb93fd07e6") - ), - ( - "dir/subdir/exe".into(), - EntryMode::BlobExecutable, - hex_to_id("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391") - ), - ( - "dir/subdir/streamed".into(), - EntryMode::Blob, - hex_to_id("08991f58f4de5d85b61c0f87f3ac053c79d0e739") - ), - ( - "extra-file".into(), - EntryMode::Blob, - hex_to_id("0000000000000000000000000000000000000000") - ), - ( - "extra-exe".into(), - expected_extra_exe_mode, - hex_to_id("0000000000000000000000000000000000000000") - ), - ( - "extra-dir-empty".into(), - EntryMode::Tree, - hex_to_id("0000000000000000000000000000000000000000") - ), - ( - "extra-dir/symlink-to-extra".into(), - EntryMode::Link, - hex_to_id("0000000000000000000000000000000000000000") - ) - ] - ); - assert_eq!( - copy.lock().len(), - 710, - "keep track of file size changes of the streaming format" - ); - - let mut copied_stream = gix_archive::Stream::from_read(std::io::Cursor::new(copy.lock().as_bytes().to_owned())); - let mut copied_paths_and_modes = Vec::new(); - let mut buf = Vec::new(); - while let Some(mut entry) = copied_stream.next_entry().expect("entry retrieval does not fail") { - copied_paths_and_modes.push((entry.relative_path().to_owned(), entry.mode, entry.id)); - buf.clear(); - entry.read_to_end(&mut buf).expect("stream can always be read"); - } - assert_eq!( - copied_paths_and_modes, paths_and_modes, - "a stream copy yields exactly the same result" - ); - Ok(()) - } - - #[test] - fn can_drop_entry_without_reading_it() -> gix_testtools::Result { - let (_dir, head_tree, odb, mut cache) = basic()?; - let mut stream = write_to( - head_tree, - { - let odb = odb.clone(); - move |id, buf| odb.find(id, buf) - }, - mutating_pipeline(false), - move |rela_path, mode, attrs| { - cache - .at_entry(rela_path, mode.is_tree().into(), |id, buf| odb.find_blob(id, buf)) - .map(|entry| 
entry.matching_attributes(attrs)) - .map(|_| ()) - }, - ); - - drop(stream.next_entry().expect("entry retrieval does not fail")); - Ok(()) - } - - fn basic() -> gix_testtools::Result<(PathBuf, gix_hash::ObjectId, gix_odb::HandleArc, gix_worktree::Cache)> { - let dir = gix_testtools::scripted_fixture_read_only("basic.sh")?; - - let head = { - let hex = std::fs::read(dir.join("head.hex"))?; - gix_hash::ObjectId::from_hex(hex.trim())? - }; - let odb = gix_odb::at(dir.join(".git").join("objects"))?; - - let mut collection = Default::default(); - let mut buf = Default::default(); - let attributes = gix_worktree::cache::state::Attributes::new( - gix_attributes::Search::new_globals(None::, &mut buf, &mut collection)?, - None, - Source::WorktreeThenIdMapping, - collection, - ); - let state = gix_worktree::cache::State::AttributesStack(attributes); - let cache = gix_worktree::Cache::new(&dir, state, Case::Sensitive, Default::default(), Default::default()); - Ok((dir, head, odb.into_arc()?, cache)) - } - - fn mutating_pipeline(driver: bool) -> gix_filter::Pipeline { - gix_filter::Pipeline::new( - &Default::default(), - gix_filter::pipeline::Options { - drivers: if driver { vec![driver_with_process()] } else { vec![] }, - eol_config: gix_filter::eol::Configuration { - auto_crlf: gix_filter::eol::AutoCrlf::Enabled, - ..Default::default() - }, - ..Default::default() - }, - ) - } - - pub(crate) fn driver_with_process() -> gix_filter::Driver { - let mut exe = DRIVER.to_string_lossy().into_owned(); - if cfg!(windows) { - exe = exe.replace('\\', "/"); - } - gix_filter::Driver { - name: "arrow".into(), - clean: None, - smudge: None, - process: Some((exe + " process").into()), - required: true, - } - } - - static DRIVER: Lazy = Lazy::new(|| { - let mut cargo = std::process::Command::new(env!("CARGO")); - let res = cargo - .args(["build", "-p=gix-filter", "--example", "arrow"]) - .status() - .expect("cargo should run fine"); - assert!(res.success(), "cargo invocation should be successful"); - - let path = PathBuf::from(env!("CARGO_TARGET_TMPDIR")) - .ancestors() - .nth(1) - .expect("first parent in target dir") - .join("debug") - .join("examples") - .join(if cfg!(windows) { "arrow.exe" } else { "arrow" }); - assert!(path.is_file(), "Expecting driver to be located at {path:?}"); - path - }); - - struct TeeToMemory { - read: R, - write: Arc>>, - } - - impl std::io::Read for TeeToMemory - where - R: std::io::Read, - { - fn read(&mut self, buf: &mut [u8]) -> std::io::Result { - let nb = self.read.read(buf)?; - self.write.lock().write_all(&buf[..nb])?; - Ok(nb) - } - } -} diff --git a/gix-archive/tests/fixtures/generated-archives/basic.tar.xz b/gix-archive/tests/fixtures/generated-archives/basic.tar.xz deleted file mode 100644 index 91d8d229b40..00000000000 --- a/gix-archive/tests/fixtures/generated-archives/basic.tar.xz +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:939077297db2dbe9917633082205c063bcd35e890b2877d530f2e0f4a75d7e3f -size 11544 diff --git a/gix-worktree-stream/Cargo.toml b/gix-worktree-stream/Cargo.toml new file mode 100644 index 00000000000..a781b67cea8 --- /dev/null +++ b/gix-worktree-stream/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "gix-worktree-stream" +version = "0.2.0" +repository = "https://github.com/Byron/gitoxide" +license = "MIT/Apache-2.0" +description = "generate a byte-stream from a git-tree" +authors = ["Sebastian Thiel "] +edition = "2021" +rust-version = "1.65" + +[lib] +doctest = false + +[dependencies] +gix-features = { version = 
"^0.32.0", path = "../gix-features", features = ["progress", "io-pipe"] } +gix-hash = { version = "^0.11.3", path = "../gix-hash" } +gix-object = { version = "^0.33.0", path = "../gix-object" } +gix-attributes = { version = "^0.15.0", path = "../gix-attributes" } +gix-filter = { version = "^0.1.0", path = "../gix-filter" } +gix-traverse = { version = "^0.30.0", path = "../gix-traverse" } +gix-fs = { version = "^0.4.0", path = "../gix-fs" } +gix-path = { version = "^0.8.3", path = "../gix-path" } + +thiserror = "1.0.26" +parking_lot = "0.12.1" + +[dev-dependencies] +gix-testtools = { path = "../tests/tools"} +gix-odb = { path = "../gix-odb"} +gix-worktree = { path = "../gix-worktree"} diff --git a/gix-worktree-stream/LICENSE-APACHE b/gix-worktree-stream/LICENSE-APACHE new file mode 120000 index 00000000000..965b606f331 --- /dev/null +++ b/gix-worktree-stream/LICENSE-APACHE @@ -0,0 +1 @@ +../LICENSE-APACHE \ No newline at end of file diff --git a/gix-worktree-stream/LICENSE-MIT b/gix-worktree-stream/LICENSE-MIT new file mode 120000 index 00000000000..76219eb72e8 --- /dev/null +++ b/gix-worktree-stream/LICENSE-MIT @@ -0,0 +1 @@ +../LICENSE-MIT \ No newline at end of file diff --git a/gix-archive/src/stream/entry.rs b/gix-worktree-stream/src/entry.rs similarity index 54% rename from gix-archive/src/stream/entry.rs rename to gix-worktree-stream/src/entry.rs index 801cd32c688..8ffbbc610ad 100644 --- a/gix-archive/src/stream/entry.rs +++ b/gix-worktree-stream/src/entry.rs @@ -1,8 +1,66 @@ -use crate::stream::Entry; -use gix_object::bstr::BStr; +use crate::{protocol, Entry, Stream}; +use gix_object::bstr::{BStr, BString}; use std::io::{ErrorKind, Read}; use std::path::PathBuf; +/// The error returned by [`next_entry()`][Stream::next_entry()]. +#[derive(Debug, thiserror::Error)] +#[allow(missing_docs)] +pub enum Error { + #[error(transparent)] + Io(#[from] std::io::Error), + #[error("Could not find a blob or tree for archival")] + Find(#[source] Box), + #[error("Could not query attributes for path \"{path}\"")] + Attributes { + path: BString, + source: Box, + }, + #[error(transparent)] + Traverse(#[from] gix_traverse::tree::breadthfirst::Error), + #[error(transparent)] + ConvertToWorktree(#[from] gix_filter::pipeline::convert::to_worktree::Error), +} + +impl Stream { + /// Access the next entry of the stream or `None` if there is nothing more to read. + pub fn next_entry(&mut self) -> Result>, Error> { + assert!( + self.path_buf.is_some(), + "BUG: must consume and drop entry before getting the next one" + ); + self.extra_entries.take(); + let res = protocol::read_entry_info( + &mut self.read, + self.path_buf.as_mut().expect("set while producing an entry"), + ); + match res { + Ok((remaining, mode, id)) => { + if let Some(err) = self.err.lock().take() { + return Err(err); + } + Ok(Some(Entry { + path_buf: self.path_buf.take(), + parent: self, + id, + mode, + remaining, + })) + } + Err(err) => { + if let Some(err) = self.err.lock().take() { + return Err(err); + } + // unexpected EOF means the other side dropped. We handled potential errors already. + if err.kind() == ErrorKind::UnexpectedEof { + return Ok(None); + } + Err(err.into()) + } + } + } +} + /// The source of an additional entry pub enum Source { /// There is no content, typically the case with directories which are always considered empty. 
@@ -39,6 +97,13 @@ impl Entry<'_> { pub fn relative_path(&self) -> &BStr { self.path_buf.as_ref().expect("always set during our lifetime").as_ref() } + + /// The amount of bytes that remain to be read, or `None` if it's fully streamed. + /// + /// This equals the length of the entry in bytes right before reading it. + pub fn bytes_remaining(&self) -> Option { + self.remaining + } } impl<'a> Drop for Entry<'a> { @@ -57,9 +122,7 @@ impl Entry<'_> { let nb = u16::from_le_bytes(u16_buf) as usize; if nb != 0 { - self.parent - .read - .read_exact(&mut self.parent.buf[self.parent.filled..][..nb])?; + self.parent.read.read_exact(&mut self.parent.buf[..nb])?; } self.parent.filled = nb; self.parent.pos = 0; diff --git a/gix-archive/src/write/mod.rs b/gix-worktree-stream/src/from_tree/mod.rs similarity index 86% rename from gix-archive/src/write/mod.rs rename to gix-worktree-stream/src/from_tree/mod.rs index 9c3c2f4c67a..b1dc028566e 100644 --- a/gix-archive/src/write/mod.rs +++ b/gix-worktree-stream/src/from_tree/mod.rs @@ -1,10 +1,12 @@ -use crate::stream::{Error, SharedErrorSlot}; -use crate::{stream, Stream}; +use crate::entry::Error; +use crate::SharedErrorSlot; +use crate::{entry, protocol, AdditionalEntry, Stream}; use gix_object::bstr::BStr; use std::io::Write; /// Use `find` to traverse `tree` and fetch the contained blobs to return as [`Stream`], which makes them queryable /// on demand with support for streaming each entry. +/// /// `pipeline` is used to convert blobs to their worktree representation, and `attributes` is used to read /// the `export-ignore` attribute. If set on a directory or blob, it won't be added to the archive. /// @@ -18,7 +20,7 @@ use std::io::Write; /// /// For per-file progress, integrate progress handling into the calls of [`Stream::next_entry()`] as that /// correlates blobs. -/// Additional interrupt handling can be wrapped around the `Read` implementation of each [`stream::Entry`]. +/// Additional interrupt handling can be wrapped around the `Read` implementation of each [`Entry`][crate::Entry]. /// For progress on bytes-written, integrate progress reporting when consuming the stream. /// Further it's possible to drop the returned [`Stream`] to halt all operation. /// @@ -30,7 +32,7 @@ use std::io::Write; /// ### Limitations /// /// * `export-subst` is not support, as it requires the entire formatting engine of `git log`. 
-pub fn write_to( +pub fn from_tree( tree: gix_hash::ObjectId, find: Find, pipeline: gix_filter::Pipeline, @@ -77,13 +79,13 @@ where fn run( tree: gix_hash::ObjectId, mut find: Find, - pipeline: gix_filter::Pipeline, + mut pipeline: gix_filter::Pipeline, mut attributes: impl FnMut(&BStr, gix_object::tree::EntryMode, &mut gix_attributes::search::Outcome) -> Result<(), E2> + Send + 'static, out: &mut gix_features::io::pipe::Writer, err: SharedErrorSlot, - additional_entries: std::sync::mpsc::Receiver, + additional_entries: std::sync::mpsc::Receiver, ) -> Result<(), Error> where Find: for<'a> FnMut(&gix_hash::oid, &'a mut Vec) -> Result, E1> + Clone + Send + 'static, @@ -92,6 +94,9 @@ where { let mut buf = Vec::new(); let obj = find(tree.as_ref(), &mut buf).map_err(|err| Error::Find(Box::new(err)))?; + if pipeline.driver_context_mut().treeish.is_none() { + pipeline.driver_context_mut().treeish = Some(tree); + } let tree = gix_object::TreeRefIter::from_bytes(obj.data); let mut attrs = gix_attributes::search::Outcome::default(); @@ -103,10 +108,10 @@ where attrs, find: { let mut find = find.clone(); - move |a: &gix_hash::oid, b: &mut Vec| find(a, b).map_err(|err| stream::Error::Find(Box::new(err))) + move |a: &gix_hash::oid, b: &mut Vec| find(a, b).map_err(|err| Error::Find(Box::new(err))) }, fetch_attributes: move |a: &BStr, b: gix_object::tree::EntryMode, c: &mut gix_attributes::search::Outcome| { - attributes(a, b, c).map_err(|err| stream::Error::Attributes { + attributes(a, b, c).map_err(|err| Error::Attributes { source: Box::new(err), path: a.to_owned(), }) @@ -127,7 +132,7 @@ where )?; for entry in additional_entries { - stream::protocol::write_entry_header_and_path( + protocol::write_entry_header_and_path( entry.relative_path.as_ref(), &entry.id, entry.mode, @@ -137,11 +142,11 @@ where // pipe writer always writes all in one go. #[allow(clippy::unused_io_amount)] match entry.source { - stream::entry::Source::Memory(buf) => out.write(&buf).map(|_| ()), - stream::entry::Source::Null => out.write(&[]).map(|_| ()), - stream::entry::Source::Path(path) => { + entry::Source::Memory(buf) => out.write(&buf).map(|_| ()), + entry::Source::Null => out.write(&[]).map(|_| ()), + entry::Source::Path(path) => { let file = std::fs::File::open(path)?; - stream::protocol::write_stream(&mut buf, file, out) + protocol::write_stream(&mut buf, file, out) } }? 
} diff --git a/gix-archive/src/write/traverse.rs b/gix-worktree-stream/src/from_tree/traverse.rs similarity index 82% rename from gix-archive/src/write/traverse.rs rename to gix-worktree-stream/src/from_tree/traverse.rs index 78e555509b8..4f6276d9706 100644 --- a/gix-archive/src/write/traverse.rs +++ b/gix-worktree-stream/src/from_tree/traverse.rs @@ -1,5 +1,5 @@ -use crate::stream; -use crate::stream::SharedErrorSlot; +use crate::entry::Error; +use crate::{protocol, SharedErrorSlot}; use gix_filter::driver::apply::MaybeDelayed; use gix_filter::pipeline::convert::ToWorktreeOutcome; use gix_object::bstr::{BStr, BString, ByteSlice, ByteVec}; @@ -11,7 +11,7 @@ use std::io::Write; pub struct Delegate<'a, AttributesFn, FindFn> where - FindFn: for<'b> FnMut(&gix_hash::oid, &'b mut Vec) -> Result, stream::Error> + 'static, + FindFn: for<'b> FnMut(&gix_hash::oid, &'b mut Vec) -> Result, Error> + 'static, { pub(crate) out: &'a mut gix_features::io::pipe::Writer, pub(crate) err: SharedErrorSlot, @@ -26,9 +26,9 @@ where impl Delegate<'_, AttributesFn, FindFn> where - FindFn: for<'b> FnMut(&gix_hash::oid, &'b mut Vec) -> Result, stream::Error> + 'static, - AttributesFn: FnMut(&BStr, gix_object::tree::EntryMode, &mut gix_attributes::search::Outcome) -> Result<(), stream::Error> - + 'static, + FindFn: for<'b> FnMut(&gix_hash::oid, &'b mut Vec) -> Result, Error> + 'static, + AttributesFn: + FnMut(&BStr, gix_object::tree::EntryMode, &mut gix_attributes::search::Outcome) -> Result<(), Error> + 'static, { fn pop_element(&mut self) { if let Some(pos) = self.path.rfind_byte(b'/') { @@ -54,7 +54,7 @@ where .state } - fn handle_entry(&mut self, entry: &tree::EntryRef<'_>) -> Result { + fn handle_entry(&mut self, entry: &tree::EntryRef<'_>) -> Result { if !entry.mode.is_blob_or_symlink() { return Ok(Action::Continue); } @@ -64,6 +64,7 @@ where } (self.find)(entry.oid, &mut self.buf)?; + self.pipeline.driver_context_mut().blob = Some(entry.oid.into()); let converted = self.pipeline.convert_to_worktree( &self.buf, self.path.as_ref(), @@ -77,7 +78,7 @@ where #[allow(clippy::unused_io_amount)] match converted { ToWorktreeOutcome::Unchanged(buf) | ToWorktreeOutcome::Buffer(buf) => { - stream::protocol::write_entry_header_and_path( + protocol::write_entry_header_and_path( self.path.as_ref(), entry.oid, entry.mode, @@ -87,14 +88,8 @@ where self.out.write(buf)?; } ToWorktreeOutcome::Process(MaybeDelayed::Immediate(read)) => { - stream::protocol::write_entry_header_and_path( - self.path.as_ref(), - entry.oid, - entry.mode, - None, - self.out, - )?; - stream::protocol::write_stream(&mut self.buf, read, self.out)?; + protocol::write_entry_header_and_path(self.path.as_ref(), entry.oid, entry.mode, None, self.out)?; + protocol::write_stream(&mut self.buf, read, self.out)?; } ToWorktreeOutcome::Process(MaybeDelayed::Delayed(_)) => { unreachable!("we forbade it") @@ -106,9 +101,9 @@ where impl Visit for Delegate<'_, AttributesFn, FindFn> where - FindFn: for<'a> FnMut(&gix_hash::oid, &'a mut Vec) -> Result, stream::Error> + 'static, - AttributesFn: FnMut(&BStr, gix_object::tree::EntryMode, &mut gix_attributes::search::Outcome) -> Result<(), stream::Error> - + 'static, + FindFn: for<'a> FnMut(&gix_hash::oid, &'a mut Vec) -> Result, Error> + 'static, + AttributesFn: + FnMut(&BStr, gix_object::tree::EntryMode, &mut gix_attributes::search::Outcome) -> Result<(), Error> + 'static, { fn pop_front_tracked_path_and_set_current(&mut self) { self.path = self diff --git a/gix-archive/src/stream/mod.rs b/gix-worktree-stream/src/lib.rs 
similarity index 75% rename from gix-archive/src/stream/mod.rs rename to gix-worktree-stream/src/lib.rs index 20383e5a4a0..10370ec1c51 100644 --- a/gix-archive/src/stream/mod.rs +++ b/gix-worktree-stream/src/lib.rs @@ -1,29 +1,39 @@ -use crate::Stream; +//! The implementation of creating an archive from a git tree, similar to `git archive`, but using an internal format. +//! +//! This crate can effectively be used to manipulate worktrees as streams of bytes, which can be decoded using the [`Stream`] type. +#![deny(rust_2018_idioms, missing_docs, unsafe_code)] + use gix_object::bstr::BString; use std::path::Path; use std::sync::Arc; -pub(crate) type SharedErrorSlot = Arc>>; - -/// The error returned by [`next_entry()`][Stream::next_entry()]. -#[derive(Debug, thiserror::Error)] -#[allow(missing_docs)] -pub enum Error { - #[error(transparent)] - Io(#[from] std::io::Error), - #[error("Could not find a blob or tree for archival")] - Find(#[source] Box), - #[error("Could not query attributes for path \"{path}\"")] - Attributes { - path: BString, - source: Box, - }, - #[error(transparent)] - Traverse(#[from] gix_traverse::tree::breadthfirst::Error), - #[error(transparent)] - ConvertToWorktree(#[from] gix_filter::pipeline::convert::to_worktree::Error), +/// A stream of entries that originate from a git tree and optionally from additional entries. +/// +/// Note that a git tree is mandatory, but the empty tree can be used to effectively disable it. +pub struct Stream { + read: utils::Read, + err: SharedErrorSlot, + extra_entries: Option>, + // additional_entries: Vec, + /// `None` if currently held by an entry. + path_buf: Option, + /// Another buffer to partially act like a buf-reader. + buf: Vec, + /// The offset into `buf` for entries being able to act like a buf reader. + pos: usize, + /// The amount of bytes usable from `buf` (even though it always has a fixed size) + filled: usize, } +/// +pub mod entry; +pub(crate) mod protocol; + +mod from_tree; +pub use from_tree::from_tree; + +pub(crate) type SharedErrorSlot = Arc>>; + /// An entry in a stream. Note that they must be consumed fully, by reading from them till exhaustion. /// /// ### Drop behaviour @@ -45,7 +55,7 @@ pub struct Entry<'a> { remaining: Option, } -/// An entry that is added to the stream by the user, verbatim, without additional worktree conversions. +/// An entry that is [added to the stream][Stream::add_entry()] by the user, verbatim, without additional worktree conversions. /// /// It may overwrite previously written paths, which may or may not work for the consumer of the stream. pub struct AdditionalEntry { @@ -70,6 +80,14 @@ impl Stream { self.read } + /// Return our internal byte stream from which entries would be generated. + /// + /// Note that the stream must then be consumed in its entirety. + pub fn as_read_mut(&mut self) -> &mut impl std::io::Read { + self.extra_entries.take(); + &mut self.read + } + /// Create a new instance from a stream of bytes in our format. 
/// /// It must have been created from [`Self::into_read()`] to be compatible, and must @@ -112,7 +130,7 @@ impl Stream { .strip_prefix(root) .map_err(|err| std::io::Error::new(std::io::ErrorKind::Other, err))?; let meta = path.symlink_metadata()?; - let relative_path = gix_path::into_bstr(rela_path).into_owned(); + let relative_path = gix_path::to_unix_separators_on_windows(gix_path::into_bstr(rela_path)).into_owned(); let id = gix_hash::ObjectId::null(gix_hash::Kind::Sha1); let entry = if meta.is_symlink() { @@ -154,7 +172,10 @@ impl Stream { gix_features::io::pipe::Writer, std::sync::mpsc::Receiver, ) { - let in_flight_writes = 3; // 2 = 1 write for entry header, 1 for hash, 1 for entry path + // 1 write for entry header and 1 for hash, 1 for entry path, + 1 for a buffer, then 32 of these. + // Giving some buffer, at the expense of memory, is important to allow consumers to take off bytes more quickly, + // otherwise, both threads effectively run in lock-step and nullify the benefit. + let in_flight_writes = (2 + 1) * 32; let (write, read) = gix_features::io::pipe::unidirectional(in_flight_writes); let (tx_entries, rx_entries) = std::sync::mpsc::channel(); ( @@ -173,8 +194,6 @@ impl Stream { } } -pub(crate) mod entry; -pub(crate) mod protocol; pub(crate) mod utils { pub enum Read { Known(gix_features::io::pipe::Reader), diff --git a/gix-archive/src/stream/protocol.rs b/gix-worktree-stream/src/protocol.rs similarity index 72% rename from gix-archive/src/stream/protocol.rs rename to gix-worktree-stream/src/protocol.rs index f2978f8044f..3afd2ffea36 100644 --- a/gix-archive/src/stream/protocol.rs +++ b/gix-worktree-stream/src/protocol.rs @@ -1,51 +1,11 @@ -use crate::stream::{Entry, Error}; -use crate::{stream, Stream}; +use crate::utils; use gix_object::bstr::{BStr, BString}; use std::io::{ErrorKind, Read, Write}; -impl Stream { - /// Access the next entry of the stream or `None` if there is nothing more to read. - pub fn next_entry(&mut self) -> Result>, Error> { - assert!( - self.path_buf.is_some(), - "BUG: must consume and drop entry before getting the next one" - ); - self.extra_entries.take(); - let res = read_entry_info( - &mut self.read, - self.path_buf.as_mut().expect("set while producing an entry"), - ); - match res { - Ok((remaining, mode, id)) => { - if let Some(err) = self.err.lock().take() { - return Err(err); - } - Ok(Some(Entry { - path_buf: self.path_buf.take(), - parent: self, - id, - mode, - remaining, - })) - } - Err(err) => { - if let Some(err) = self.err.lock().take() { - return Err(err); - } - // unexpected EOF means the other side dropped. We handled potential errors already. 
- if err.kind() == ErrorKind::UnexpectedEof { - return Ok(None); - } - Err(err.into()) - } - } - } -} - // Format: [usize-LE][usize-LE][byte][byte][hash][relative_path_bytes][object_stream] // Note that stream_len can be usize::MAX to indicate the stream size is unknown -fn read_entry_info( - read: &mut stream::utils::Read, +pub(crate) fn read_entry_info( + read: &mut utils::Read, path_buf: &mut BString, ) -> std::io::Result<(Option, gix_object::tree::EntryMode, gix_hash::ObjectId)> { let mut buf = [0; std::mem::size_of::() * 2 + 2]; @@ -75,7 +35,8 @@ pub(crate) fn write_entry_header_and_path( stream_len: Option, out: &mut gix_features::io::pipe::Writer, ) -> std::io::Result<()> { - let mut buf = [0u8; std::mem::size_of::() * 2 + 2]; + const HEADER_LEN: usize = std::mem::size_of::() * 2 + 2; + let mut buf = [0u8; HEADER_LEN + gix_hash::Kind::longest().len_in_bytes()]; let (path_len_buf, rest) = buf.split_at_mut(std::mem::size_of::()); let (stream_len_buf, bytes) = rest.split_at_mut(std::mem::size_of::()); @@ -83,12 +44,12 @@ pub(crate) fn write_entry_header_and_path( stream_len_buf.copy_from_slice(&stream_len.unwrap_or(usize::MAX).to_le_bytes()); bytes[0] = mode_to_byte(mode); bytes[1] = hash_to_byte(oid.kind()); + bytes[2..][..oid.kind().len_in_bytes()].copy_from_slice(oid.as_bytes()); // We know how `out` works in a pipe writer, it's always writing everything. #[allow(clippy::unused_io_amount)] { - out.write(&buf)?; - out.write(oid.as_bytes())?; + out.write(&buf[..HEADER_LEN + oid.kind().len_in_bytes()])?; out.write(path)?; } Ok(()) diff --git a/gix-worktree-stream/tests/fixtures/basic.sh b/gix-worktree-stream/tests/fixtures/basic.sh new file mode 100644 index 00000000000..f265df938fa --- /dev/null +++ b/gix-worktree-stream/tests/fixtures/basic.sh @@ -0,0 +1,34 @@ +set -eu -o pipefail + +git init + +mkdir dir-ignored +touch dir-ignored/file-ignored-transitively +touch file-ignored + +echo "hi" > a +mkdir dir +echo "ho" > dir/b +mkdir dir/subdir +echo "subdir/streamed filter=arrow" > dir/.gitattributes +echo "streamed-by-driver" > dir/subdir/streamed +touch dir/subdir/exe +chmod +x dir/subdir/exe +ln -s a symlink-to-a + +echo "/dir-ignored/ export-ignore" > .gitattributes +echo "/file-ignored export-ignore" >> .gitattributes + +dd if=/dev/zero of=bigfile bs=1024 count=156 + +git add . +git commit -m "init" + +echo "extra" > extra-file +touch extra-exe && chmod +x extra-exe +mkdir extra-dir-empty extra-dir +ln -s ../extra-file extra-dir/symlink-to-extra +dd if=/dev/zero of=extra-bigfile bs=1024 count=156 + +git rev-parse @^{tree} > head.hex + diff --git a/gix-worktree-stream/tests/fixtures/generated-archives/basic.tar.xz b/gix-worktree-stream/tests/fixtures/generated-archives/basic.tar.xz new file mode 100644 index 00000000000..2bfb7f7e1b4 --- /dev/null +++ b/gix-worktree-stream/tests/fixtures/generated-archives/basic.tar.xz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96d22b17f2347b53f29dd5de8b7c217fde40d0289f1b7af8bbeee5c6dd0002fe +size 11908 diff --git a/gix-worktree-stream/tests/stream.rs b/gix-worktree-stream/tests/stream.rs new file mode 100644 index 00000000000..7571669a51f --- /dev/null +++ b/gix-worktree-stream/tests/stream.rs @@ -0,0 +1,306 @@ +/// Convert a hexadecimal hash into its corresponding `ObjectId` or _panic_. 
+fn hex_to_id(hex: &str) -> gix_hash::ObjectId { + gix_hash::ObjectId::from_hex(hex.as_bytes()).expect("40 bytes hex") +} + +mod from_tree { + use crate::hex_to_id; + use gix_attributes::glob::pattern::Case; + use gix_object::bstr::ByteSlice; + use gix_object::tree::EntryMode; + use gix_odb::FindExt; + use gix_testtools::once_cell::sync::Lazy; + use gix_worktree::cache::state::attributes::Source; + use std::convert::Infallible; + use std::io::{Error, ErrorKind, Read, Write}; + use std::path::PathBuf; + use std::sync::Arc; + + #[test] + fn can_receive_err_if_root_is_not_found() { + let mut stream = gix_worktree_stream::from_tree( + gix_hash::Kind::Sha1.null(), + |_, _| Err(Error::new(ErrorKind::Other, "object retrieval failed")), + mutating_pipeline(false), + |_, _, _| -> Result<_, Infallible> { unreachable!("must not be called") }, + ); + let err = stream.next_entry().unwrap_err(); + assert_eq!(err.to_string(), "Could not find a blob or tree for archival"); + } + + #[test] + fn can_receive_err_if_attribute_not_found() -> gix_testtools::Result { + let (_dir, head_tree, odb, _cache) = basic()?; + let mut stream = gix_worktree_stream::from_tree( + head_tree, + move |id, buf| odb.find(id, buf), + mutating_pipeline(false), + |_, _, _| Err(Error::new(ErrorKind::Other, "attribute retrieval failed")), + ); + let err = stream.next_entry().unwrap_err(); + assert_eq!( + err.to_string(), + "Could not query attributes for path \".gitattributes\"" + ); + Ok(()) + } + + #[test] + fn will_provide_all_information_and_respect_export_ignore() -> gix_testtools::Result { + let (dir, head_tree, odb, mut cache) = basic()?; + let mut stream = gix_worktree_stream::from_tree( + head_tree, + { + let odb = odb.clone(); + move |id, buf| odb.find(id, buf) + }, + mutating_pipeline(true), + move |rela_path, mode, attrs| { + cache + .at_entry(rela_path, mode.is_tree().into(), |id, buf| odb.find_blob(id, buf)) + .map(|entry| entry.matching_attributes(attrs)) + .map(|_| ()) + }, + ); + stream + .add_entry_from_path(&dir, &dir.join("extra-file"))? + .add_entry_from_path(&dir, &dir.join("extra-bigfile"))? + .add_entry_from_path(&dir, &dir.join("extra-exe"))? + .add_entry_from_path(&dir, &dir.join("extra-dir-empty"))? 
+ .add_entry_from_path(&dir, &dir.join("extra-dir").join("symlink-to-extra"))?; + + let tee_read = TeeToMemory { + read: stream.into_read(), + write: Default::default(), + }; + let copy = tee_read.write.clone(); + let mut paths_and_modes = Vec::new(); + let mut stream = gix_worktree_stream::Stream::from_read(tee_read); + + while let Some(mut entry) = stream.next_entry().expect("entry retrieval does not fail") { + paths_and_modes.push((entry.relative_path().to_owned(), entry.mode, entry.id)); + let mut buf = Vec::new(); + entry.read_to_end(&mut buf).expect("stream can always be read"); + if !buf.is_empty() && entry.mode.is_blob() { + if entry.relative_path().contains_str("extra") { + assert!( + buf.find_byte(b'\r').is_none(), + "extra-files are not processed in any way" + ); + } else if !entry.relative_path().contains_str("big") { + assert!( + buf.find_byte(b'\r').is_some(), + "'{}' did not contain a carriage return as sign of having been filtered", + buf.as_bstr() + ); + if entry.relative_path().ends_with_str(b"streamed") { + assert_eq!(buf.as_bstr(), "➡streamed-by-driver\r\n"); + } + } + } + } + + let expected_extra_exe_mode = if cfg!(windows) { + EntryMode::Blob + } else { + EntryMode::BlobExecutable + }; + assert_eq!( + paths_and_modes, + &[ + ( + ".gitattributes".into(), + EntryMode::Blob, + hex_to_id("45c160c35c17ad264b96431cceb9793160396e99") + ), + ( + "a".into(), + EntryMode::Blob, + hex_to_id("45b983be36b73c0788dc9cbcb76cbb80fc7bb057") + ), + ( + "bigfile".into(), + EntryMode::Blob, + hex_to_id("4995fde49ed64e043977e22539f66a0d372dd129") + ), + ( + "symlink-to-a".into(), + EntryMode::Link, + hex_to_id("2e65efe2a145dda7ee51d1741299f848e5bf752e") + ), + ( + "dir/.gitattributes".into(), + EntryMode::Blob, + hex_to_id("81b9a375276405703e05be6cecf0fc1c8b8eed64") + ), + ( + "dir/b".into(), + EntryMode::Blob, + hex_to_id("ab4a98190cf776b43cb0fe57cef231fb93fd07e6") + ), + ( + "dir/subdir/exe".into(), + EntryMode::BlobExecutable, + hex_to_id("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391") + ), + ( + "dir/subdir/streamed".into(), + EntryMode::Blob, + hex_to_id("08991f58f4de5d85b61c0f87f3ac053c79d0e739") + ), + ( + "extra-file".into(), + EntryMode::Blob, + hex_to_id("0000000000000000000000000000000000000000") + ), + ( + "extra-bigfile".into(), + EntryMode::Blob, + hex_to_id("0000000000000000000000000000000000000000") + ), + ( + "extra-exe".into(), + expected_extra_exe_mode, + hex_to_id("0000000000000000000000000000000000000000") + ), + ( + "extra-dir-empty".into(), + EntryMode::Tree, + hex_to_id("0000000000000000000000000000000000000000") + ), + ( + "extra-dir/symlink-to-extra".into(), + EntryMode::Link, + hex_to_id("0000000000000000000000000000000000000000") + ) + ] + ); + assert_eq!( + copy.lock().len(), + 320302, + "keep track of file size changes of the streaming format" + ); + + let mut copied_stream = + gix_worktree_stream::Stream::from_read(std::io::Cursor::new(copy.lock().as_bytes().to_owned())); + let mut copied_paths_and_modes = Vec::new(); + let mut buf = Vec::new(); + while let Some(mut entry) = copied_stream.next_entry().expect("entry retrieval does not fail") { + copied_paths_and_modes.push((entry.relative_path().to_owned(), entry.mode, entry.id)); + buf.clear(); + entry.read_to_end(&mut buf).expect("stream can always be read"); + } + assert_eq!( + copied_paths_and_modes, paths_and_modes, + "a stream copy yields exactly the same result" + ); + Ok(()) + } + + #[test] + fn can_drop_entry_without_reading_it() -> gix_testtools::Result { + let (_dir, head_tree, odb, mut cache) = 
basic()?; + let mut stream = gix_worktree_stream::from_tree( + head_tree, + { + let odb = odb.clone(); + move |id, buf| odb.find(id, buf) + }, + mutating_pipeline(false), + move |rela_path, mode, attrs| { + cache + .at_entry(rela_path, mode.is_tree().into(), |id, buf| odb.find_blob(id, buf)) + .map(|entry| entry.matching_attributes(attrs)) + .map(|_| ()) + }, + ); + + drop(stream.next_entry().expect("entry retrieval does not fail")); + Ok(()) + } + + fn basic() -> gix_testtools::Result<(PathBuf, gix_hash::ObjectId, gix_odb::HandleArc, gix_worktree::Cache)> { + let dir = gix_testtools::scripted_fixture_read_only("basic.sh")?; + + let head = { + let hex = std::fs::read(dir.join("head.hex"))?; + gix_hash::ObjectId::from_hex(hex.trim())? + }; + let odb = gix_odb::at(dir.join(".git").join("objects"))?; + + let mut collection = Default::default(); + let mut buf = Default::default(); + let attributes = gix_worktree::cache::state::Attributes::new( + gix_attributes::Search::new_globals(None::, &mut buf, &mut collection)?, + None, + Source::WorktreeThenIdMapping, + collection, + ); + let state = gix_worktree::cache::State::AttributesStack(attributes); + let cache = gix_worktree::Cache::new(&dir, state, Case::Sensitive, Default::default(), Default::default()); + Ok((dir, head, odb.into_arc()?, cache)) + } + + fn mutating_pipeline(driver: bool) -> gix_filter::Pipeline { + gix_filter::Pipeline::new( + &Default::default(), + gix_filter::pipeline::Options { + drivers: if driver { vec![driver_with_process()] } else { vec![] }, + eol_config: gix_filter::eol::Configuration { + auto_crlf: gix_filter::eol::AutoCrlf::Enabled, + ..Default::default() + }, + ..Default::default() + }, + ) + } + + pub(crate) fn driver_with_process() -> gix_filter::Driver { + let mut exe = DRIVER.to_string_lossy().into_owned(); + if cfg!(windows) { + exe = exe.replace('\\', "/"); + } + gix_filter::Driver { + name: "arrow".into(), + clean: None, + smudge: None, + process: Some((exe + " process").into()), + required: true, + } + } + + static DRIVER: Lazy = Lazy::new(|| { + let mut cargo = std::process::Command::new(env!("CARGO")); + let res = cargo + .args(["build", "-p=gix-filter", "--example", "arrow"]) + .status() + .expect("cargo should run fine"); + assert!(res.success(), "cargo invocation should be successful"); + + let path = PathBuf::from(env!("CARGO_TARGET_TMPDIR")) + .ancestors() + .nth(1) + .expect("first parent in target dir") + .join("debug") + .join("examples") + .join(if cfg!(windows) { "arrow.exe" } else { "arrow" }); + assert!(path.is_file(), "Expecting driver to be located at {path:?}"); + path + }); + + struct TeeToMemory { + read: R, + write: Arc>>, + } + + impl std::io::Read for TeeToMemory + where + R: std::io::Read, + { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + let nb = self.read.read(buf)?; + self.write.lock().write_all(&buf[..nb])?; + Ok(nb) + } + } +} From 489abd9b6e44199df2e07f674542f0a5b3c12ad1 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 20 Jul 2023 15:34:12 +0200 Subject: [PATCH 09/18] feat: basic tar support for `gix-archive` It's added as feature toggle. 
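For illustration, a minimal usage sketch (not part of this patch) of how a consumer might drive the new `tar` support by wiring a `gix-worktree-stream` stream into `write_stream()`. The helper name `tar_archive` and its `odb`/`head_tree`/`out` inputs are assumptions for the example (e.g. prepared the way the tests below do it), and attribute lookups are stubbed out, so `export-ignore` is not honored here:

    use std::convert::Infallible;
    use gix_odb::FindExt;

    // Stream `head_tree` from `odb` and persist it as a `tar` archive into `out`.
    fn tar_archive(
        odb: gix_odb::HandleArc,
        head_tree: gix_hash::ObjectId,
        out: impl std::io::Write,
    ) -> Result<(), Box<dyn std::error::Error>> {
        let mut stream = gix_worktree_stream::from_tree(
            head_tree,
            // object lookup for trees and blobs
            move |id, buf| odb.find(id, buf),
            // a filter pipeline with default options (no custom drivers configured)
            gix_filter::Pipeline::new(&Default::default(), Default::default()),
            // no attribute lookup: `export-ignore` is not respected in this sketch
            |_, _, _| -> Result<_, Infallible> { Ok(()) },
        );
        gix_archive::write_stream(
            &mut stream,
            gix_worktree_stream::Stream::next_entry,
            out,
            gix_archive::Options {
                format: gix_archive::Format::Tar,
                ..Default::default()
            },
        )?;
        Ok(())
    }

Keeping this behind the default-enabled `tar` feature leaves the `tar` dependency optional for consumers that only need the internal stream format.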
--- Cargo.lock | 21 ++ gix-archive/Cargo.toml | 33 ++- gix-archive/src/lib.rs | 47 ++-- gix-archive/src/write.rs | 123 ++++++++++- gix-archive/tests/archive.rs | 207 ++++++++++++++++++ gix-archive/tests/fixtures/basic.sh | 5 +- .../fixtures/generated-archives/basic.tar.xz | 3 + justfile | 2 + 8 files changed, 416 insertions(+), 25 deletions(-) create mode 100644 gix-archive/tests/fixtures/generated-archives/basic.tar.xz diff --git a/Cargo.lock b/Cargo.lock index e2bbc5a47d2..cda20b22b1e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1398,7 +1398,18 @@ dependencies = [ name = "gix-archive" version = "0.2.0" dependencies = [ + "bstr", + "document-features", + "gix-attributes 0.15.0", + "gix-filter", + "gix-hash 0.11.3", + "gix-object 0.33.0", + "gix-odb", + "gix-path 0.8.3", + "gix-testtools", + "gix-worktree 0.22.0", "gix-worktree-stream", + "tar", "thiserror", ] @@ -4224,6 +4235,7 @@ checksum = "ec96d2ffad078296368d46ff1cb309be1c23c513b4ab0e22a45de0185275ac96" dependencies = [ "filetime", "libc", + "xattr", ] [[package]] @@ -5009,6 +5021,15 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "xattr" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d1526bbe5aaeb5eb06885f4d987bcdfa5e23187055de9b83fe00156a821fabc" +dependencies = [ + "libc", +] + [[package]] name = "xz2" version = "0.1.7" diff --git a/gix-archive/Cargo.toml b/gix-archive/Cargo.toml index fed7d899c0d..fcb4edb2f04 100644 --- a/gix-archive/Cargo.toml +++ b/gix-archive/Cargo.toml @@ -3,7 +3,7 @@ name = "gix-archive" version = "0.2.0" repository = "https://github.com/Byron/gitoxide" license = "MIT OR Apache-2.0" -description = "implements archive generation from of a tree" +description = "archive generation from of a worktree stream" authors = ["Sebastian Thiel "] edition = "2021" rust-version = "1.65" @@ -11,6 +11,35 @@ rust-version = "1.65" [lib] doctest = false +[features] +default = ["tar"] + +## Enable the `tar` archive format. It has support for all information, except for object ids. +tar = ["dep:tar", "dep:gix-path"] + + [dependencies] -thiserror = "1.0.26" gix-worktree-stream = { version = "^0.2.0", path = "../gix-worktree-stream" } +gix-object = { version = "^0.33.0", path = "../gix-object" } +gix-path = { version = "^0.8.3", path = "../gix-path", optional = true } + +thiserror = "1.0.26" +bstr = { version = "1.5.0", default-features = false } + +tar = { version = "0.4.38", optional = true } + +document-features = { version = "0.2.0", optional = true } + +[dev-dependencies] +gix-testtools = { path = "../tests/tools"} +gix-odb = { path = "../gix-odb"} +gix-worktree = { path = "../gix-worktree"} +gix-hash = { path = "../gix-hash"} +gix-attributes = { path = "../gix-attributes"} +gix-object = { path = "../gix-object"} +gix-filter = { path = "../gix-filter"} + +[package.metadata.docs.rs] +all-features = true +features = ["document-features"] +rustdoc-args = ["--cfg", "docsrs"] diff --git a/gix-archive/src/lib.rs b/gix-archive/src/lib.rs index 0b229507f43..541ae378f1b 100644 --- a/gix-archive/src/lib.rs +++ b/gix-archive/src/lib.rs @@ -1,20 +1,42 @@ -//! The implementation of creating an archive from a git tree, similar to `git archive`. +//! The implementation of creating an archive from a worktree stream, similar to `git archive`. +//! +//! ## Deviation +//! +//! This implementation is early and just does the basics. Git does more to support more context when filtering and to keep +//! more information about entries in the various archive formats. +//! 
`tar` is implemented in a very basic fashion only. +//! +//! ## Feature Flags +//! All features are related to which container formats are available. +#![cfg_attr( + feature = "document-features", + cfg_attr(doc, doc = ::document_features::document_features!()) +)] +#![cfg_attr(docsrs, feature(doc_cfg, doc_auto_cfg))] #![deny(rust_2018_idioms, missing_docs)] #![forbid(unsafe_code)] -/// The error returned by [`write_to()`]. +use bstr::BString; + +/// The error returned by [`write_stream()`]. #[derive(Debug, thiserror::Error)] #[allow(missing_docs)] pub enum Error { #[error(transparent)] Io(#[from] std::io::Error), + #[error(transparent)] + NextStreamEntry(#[from] gix_worktree_stream::entry::Error), + #[error("The internal format cannot be used as an archive, it's merely a debugging tool")] + InternalFormatMustNotPersist, } -/// The supported container formats for use in [`write_to()`]. +/// The supported container formats for use in [`write_stream()`]. #[derive(Default, PartialEq, Eq, Copy, Clone, Debug)] pub enum Format { /// An internal format that is suitable only for intra-process communication. /// + /// All transformations in the options are ignored. Calling [`write_stream`] is disallowed + /// as it's more efficient to call [gix_worktree_stream::Stream::into_read()] right away. /// It is provided here as a basis available without extra dependencies, and as a debugging tool. #[default] InternalTransientNonPersistable, @@ -22,26 +44,19 @@ pub enum Format { /// /// Use it as well if a custom container format is desired. The idea is to decode it on a separate thread /// to rewrite the data to the desired format. + #[cfg(feature = "tar")] Tar, - /// A convenience format that will `zip` deflate the `tar` stream. - TarGz { - /// The compression level to use for the `zlib` compression, ranging from 0 (no compression) to 9 (best compression). - compression_level: u8, - }, - /// Use the zip` container format, instead of `tar`, provided for convenience. - Zip { - /// The compression level to use for the `zlib` compression, ranging from 0 (no compression) to 9 (best compression). - compression_level: u8, - }, } -/// Options for configuring [`write_to()`]. +/// Options for configuring [`write_stream()`]. #[derive(Clone, Debug)] pub struct Options { /// The archive's format. pub format: Format, /// Given a `path`, originating in the git tree, to place into the archive, put `/path` in front of it. - pub tree_prefix: Option, + /// + /// Note that that `/` should be used as separator, and that a prefix directory has to end with `/`. + pub tree_prefix: Option, /// The modification time for all entries in the archive. /// /// Defaults to the current time. The caller may set this to the commit time if available. @@ -59,4 +74,4 @@ impl Default for Options { } mod write; -pub use write::write_to; +pub use write::write_stream; diff --git a/gix-archive/src/write.rs b/gix-archive/src/write.rs index b4dce740099..2671fb88eed 100644 --- a/gix-archive/src/write.rs +++ b/gix-archive/src/write.rs @@ -1,7 +1,122 @@ -use crate::{Error, Options}; -use gix_worktree_stream::Stream; +use crate::{Error, Format, Options}; +use gix_worktree_stream::{Entry, Stream}; -/// Write the worktree `stream` to `out` configured according to `opts`. -pub fn write_to(_stream: &mut Stream, mut _out: impl std::io::Write, _opts: Options) -> Result<(), Error> { +/// Write all stream entries in `stream` as provided by `next_entry(stream)` to `out` configured according to `opts`. 
+/// +/// ### Performance +/// +/// * The caller should be sure `out` is fast enough. If in doubt, wrap in [`std::io::BufWriter`]. +/// * Further, big files aren't suitable for archival into `tar` archives as they require the size of the stream to be known +/// prior to writing the header of each entry. +#[cfg_attr(not(feature = "tar"), allow(unused_mut, unused_variables))] +pub fn write_stream( + stream: &mut Stream, + mut next_entry: NextFn, + out: impl std::io::Write, + opts: Options, +) -> Result<(), Error> +where + NextFn: FnMut(&mut Stream) -> Result>, gix_worktree_stream::entry::Error>, +{ + if opts.format == Format::InternalTransientNonPersistable { + return Err(Error::InternalFormatMustNotPersist); + } + #[cfg(feature = "tar")] + { + enum State { + #[cfg(feature = "tar")] + Tar((tar::Builder, Vec)), + } + + impl State { + pub fn new(format: Format, out: W) -> Self { + match format { + Format::InternalTransientNonPersistable => unreachable!("handled earlier"), + #[cfg(feature = "tar")] + Format::Tar => State::Tar(( + { + let mut ar = tar::Builder::new(out); + ar.mode(tar::HeaderMode::Deterministic); + ar + }, + Vec::with_capacity(64 * 1024), + )), + } + } + } + + let mut state = State::new(opts.format, out); + let mtime_seconds_since_epoch = opts + .modification_time + .duration_since(std::time::UNIX_EPOCH) + .ok() + .map(|d| d.as_secs()); + + while let Some(mut entry) = next_entry(stream)? { + match &mut state { + #[cfg(feature = "tar")] + State::Tar((ar, buf)) => { + let mut header = tar::Header::new_gnu(); + if let Some(mtime) = mtime_seconds_since_epoch { + header.set_mtime(mtime); + } + header.set_entry_type(tar_entry_type(entry.mode)); + header.set_mode(if matches!(entry.mode, gix_object::tree::EntryMode::BlobExecutable) { + 0o755 + } else { + 0o644 + }); + buf.clear(); + std::io::copy(&mut entry, buf)?; + + let path = gix_path::from_bstr(add_prefix(entry.relative_path(), opts.tree_prefix.as_ref())); + header.set_size(buf.len() as u64); + + if entry.mode == gix_object::tree::EntryMode::Link { + use bstr::ByteSlice; + let target = gix_path::from_bstr(buf.as_bstr()); + header.set_entry_type(tar::EntryType::Symlink); + header.set_size(0); + ar.append_link(&mut header, path, target)?; + } else { + ar.append_data(&mut header, path, buf.as_slice())?; + } + } + } + } + + match state { + #[cfg(feature = "tar")] + State::Tar((mut ar, _)) => { + ar.finish()?; + } + } + } Ok(()) } + +#[cfg(feature = "tar")] +fn tar_entry_type(mode: gix_object::tree::EntryMode) -> tar::EntryType { + use gix_object::tree::EntryMode; + use tar::EntryType; + match mode { + EntryMode::Tree | EntryMode::Commit => EntryType::Directory, + EntryMode::Blob => EntryType::Regular, + EntryMode::BlobExecutable => EntryType::Regular, + EntryMode::Link => EntryType::Link, + } +} + +#[cfg(feature = "tar")] +fn add_prefix<'a>(relative_path: &'a bstr::BStr, prefix: Option<&bstr::BString>) -> std::borrow::Cow<'a, bstr::BStr> { + use std::borrow::Cow; + match prefix { + None => Cow::Borrowed(relative_path), + Some(prefix) => { + use bstr::ByteVec; + let mut buf = prefix.clone(); + buf.push_str(relative_path); + Cow::Owned(buf) + } + } +} diff --git a/gix-archive/tests/archive.rs b/gix-archive/tests/archive.rs index 8b137891791..a58ba972031 100644 --- a/gix-archive/tests/archive.rs +++ b/gix-archive/tests/archive.rs @@ -1 +1,208 @@ +/// Convert a hexadecimal hash into its corresponding `ObjectId` or _panic_. 
+fn hex_to_id(hex: &str) -> gix_hash::ObjectId { + gix_hash::ObjectId::from_hex(hex.as_bytes()).expect("40 bytes hex") +} +mod from_tree { + use crate::hex_to_id; + use gix_archive::Format; + use gix_attributes::glob::pattern::Case; + use gix_object::tree::EntryMode; + use gix_odb::FindExt; + use gix_testtools::bstr::ByteSlice; + use gix_worktree::cache::state::attributes::Source; + use std::io::Read; + use std::path::PathBuf; + + #[test] + fn basic_usage_internal() -> gix_testtools::Result { + basic_usage(gix_archive::Format::InternalTransientNonPersistable, |buf| { + assert_eq!(buf.len(), 551); + + let mut stream = gix_worktree_stream::Stream::from_read(std::io::Cursor::new(buf)); + let mut paths_and_modes = Vec::new(); + while let Some(mut entry) = stream.next_entry().expect("entry retrieval does not fail") { + paths_and_modes.push((entry.relative_path().to_owned(), entry.mode, entry.id)); + let mut buf = Vec::new(); + entry.read_to_end(&mut buf).expect("stream can always be read"); + } + + let expected_extra_exe_mode = if cfg!(windows) { + EntryMode::Blob + } else { + EntryMode::BlobExecutable + }; + assert_eq!( + paths_and_modes, + &[ + ( + ".gitattributes".into(), + EntryMode::Blob, + hex_to_id("45c160c35c17ad264b96431cceb9793160396e99") + ), + ( + "a".into(), + EntryMode::Blob, + hex_to_id("45b983be36b73c0788dc9cbcb76cbb80fc7bb057") + ), + ( + "symlink-to-a".into(), + EntryMode::Link, + hex_to_id("2e65efe2a145dda7ee51d1741299f848e5bf752e") + ), + ( + "dir/b".into(), + EntryMode::Blob, + hex_to_id("ab4a98190cf776b43cb0fe57cef231fb93fd07e6") + ), + ( + "dir/subdir/exe".into(), + EntryMode::BlobExecutable, + hex_to_id("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391") + ), + ( + "extra-file".into(), + EntryMode::Blob, + hex_to_id("0000000000000000000000000000000000000000") + ), + ( + "extra-exe".into(), + expected_extra_exe_mode, + hex_to_id("0000000000000000000000000000000000000000") + ), + ( + "extra-dir-empty".into(), + EntryMode::Tree, + hex_to_id("0000000000000000000000000000000000000000") + ), + ( + "extra-dir/symlink-to-extra".into(), + EntryMode::Link, + hex_to_id("0000000000000000000000000000000000000000") + ) + ] + ); + Ok(()) + }) + } + + #[test] + #[cfg(feature = "tar")] + fn basic_usage_tar() -> gix_testtools::Result { + basic_usage(gix_archive::Format::Tar, |buf| { + use tar::EntryType; + let mut ar = tar::Archive::new(buf.as_slice()); + let mut out = Vec::new(); + for entry in ar.entries()? 
{ + let mut entry = entry?; + let copied = std::io::copy(&mut entry, &mut std::io::sink())?; + + let header = entry.header(); + assert_eq!( + copied, + header.size()?, + "size field matches the size of the actual stream" + ); + out.push(( + entry.path_bytes().as_bstr().to_owned(), + header.entry_type(), + header.size()?, + header.mode()?, + )); + } + let expected_extra_exe_mode = if cfg!(windows) { 420 } else { 493 }; + assert_eq!( + out, + [ + ("prefix/.gitattributes", EntryType::Regular, 56, 420), + ("prefix/a", EntryType::Regular, 3, 420), + ("prefix/symlink-to-a", EntryType::Symlink, 0, 420), + ("prefix/dir/b", EntryType::Regular, 3, 420), + ("prefix/dir/subdir/exe", EntryType::Regular, 0, 493), + ("prefix/extra-file", EntryType::Regular, 21, 420), + ("prefix/extra-exe", EntryType::Regular, 0, expected_extra_exe_mode), + ("prefix/extra-dir-empty", EntryType::Directory, 0, 420), + ("prefix/extra-dir/symlink-to-extra", EntryType::Symlink, 0, 420) + ] + .into_iter() + .map(|(path, b, c, d)| (bstr::BStr::new(path).to_owned(), b, c, d)) + .collect::>() + ); + Ok(()) + }) + } + + fn basic_usage( + format: gix_archive::Format, + make_assertion: impl FnOnce(Vec) -> gix_testtools::Result, + ) -> gix_testtools::Result { + let (dir, head_tree, odb, mut cache) = basic()?; + let mut stream = gix_worktree_stream::from_tree( + head_tree, + { + let odb = odb.clone(); + move |id, buf| odb.find(id, buf) + }, + noop_pipeline(), + move |rela_path, mode, attrs| { + cache + .at_entry(rela_path, mode.is_tree().into(), |id, buf| odb.find_blob(id, buf)) + .map(|entry| entry.matching_attributes(attrs)) + .map(|_| ()) + }, + ); + stream + .add_entry_from_path(&dir, &dir.join("extra-file"))? + .add_entry_from_path(&dir, &dir.join("extra-exe"))? + .add_entry_from_path(&dir, &dir.join("extra-dir-empty"))? + .add_entry_from_path(&dir, &dir.join("extra-dir").join("symlink-to-extra"))?; + + let mut buf = Vec::new(); + if format == Format::InternalTransientNonPersistable { + std::io::copy(&mut stream.into_read(), &mut buf)?; + } else { + gix_archive::write_stream( + &mut stream, + gix_worktree_stream::Stream::next_entry, + &mut buf, + gix_archive::Options { + format, + tree_prefix: Some("prefix/".into()), + modification_time: std::time::UNIX_EPOCH + std::time::Duration::from_secs(120), + }, + )?; + assert!( + stream.next_entry()?.is_none(), + "stream is exhausted, all written to buf" + ); + } + make_assertion(buf).expect("all tests pass"); + Ok(()) + } + + fn basic() -> gix_testtools::Result<(PathBuf, gix_hash::ObjectId, gix_odb::HandleArc, gix_worktree::Cache)> { + let dir = gix_testtools::scripted_fixture_read_only("basic.sh")?; + + let head = { + let hex = std::fs::read(dir.join("head.hex"))?; + gix_hash::ObjectId::from_hex(hex.trim())? 
+ }; + let odb = gix_odb::at(dir.join(".git").join("objects"))?; + + let mut collection = Default::default(); + let mut buf = Default::default(); + let attributes = gix_worktree::cache::state::Attributes::new( + gix_attributes::Search::new_globals(None::, &mut buf, &mut collection)?, + None, + Source::WorktreeThenIdMapping, + collection, + ); + let state = gix_worktree::cache::State::AttributesStack(attributes); + let cache = gix_worktree::Cache::new(&dir, state, Case::Sensitive, Default::default(), Default::default()); + Ok((dir, head, odb.into_arc()?, cache)) + } + + fn noop_pipeline() -> gix_filter::Pipeline { + gix_filter::Pipeline::new(&Default::default(), Default::default()) + } +} diff --git a/gix-archive/tests/fixtures/basic.sh b/gix-archive/tests/fixtures/basic.sh index 9aad15f089d..e9f3a7f4f1d 100644 --- a/gix-archive/tests/fixtures/basic.sh +++ b/gix-archive/tests/fixtures/basic.sh @@ -10,8 +10,7 @@ echo "hi" > a mkdir dir echo "ho" > dir/b mkdir dir/subdir -echo "subdir/streamed filter=arrow" > dir/.gitattributes -echo "streamed-by-driver" > dir/subdir/streamed + touch dir/subdir/exe chmod +x dir/subdir/exe ln -s a symlink-to-a @@ -22,7 +21,7 @@ echo "/file-ignored export-ignore" >> .gitattributes git add . git commit -m "init" -echo "extra" > extra-file +echo "extra to be streamed" > extra-file touch extra-exe && chmod +x extra-exe mkdir extra-dir-empty extra-dir ln -s ../extra-file extra-dir/symlink-to-extra diff --git a/gix-archive/tests/fixtures/generated-archives/basic.tar.xz b/gix-archive/tests/fixtures/generated-archives/basic.tar.xz new file mode 100644 index 00000000000..817cdcc8260 --- /dev/null +++ b/gix-archive/tests/fixtures/generated-archives/basic.tar.xz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0deb139f62fcd2c3769b151b506d7a853e383d2e16d7ff4a12f46feb7aed24a5 +size 11208 diff --git a/justfile b/justfile index 1c7b213516b..a4bbf4e8fe6 100755 --- a/justfile +++ b/justfile @@ -131,6 +131,8 @@ doc $RUSTDOCFLAGS="-D warnings": # run all unit tests unit-tests: cargo test --all + cargo test -p gix-archive --no-default-features + cargo test -p gix-archive --features tar cd gix-object; \ set -ex; \ cargo test; \ From 4ee285741e6e1cde3a967980fbf48bab20ddbf68 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 20 Jul 2023 19:26:04 +0200 Subject: [PATCH 10/18] feat: optionally make `gix-workspace-stream` available via `Repository::worktree_stream()` That way it's easy to obtain a representation of the worktree in a fully streaming fashion, which is also the basis for `archive`-like functionality. 
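A minimal usage sketch (illustration only, mirroring the test added below; requires
the `worktree-stream` feature and an already opened `repo`):

    let (stream, _index) = repo.worktree_stream(repo.head_commit()?.tree_id()?)?;
    let mut read = stream.into_read();
    std::io::copy(&mut read, &mut std::io::sink())?;
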
--- Cargo.lock | 1 + Cargo.toml | 4 +-- gix/Cargo.toml | 24 ++++++++++++----- gix/src/id.rs | 2 +- gix/src/repository/mod.rs | 23 +++++++++++++++++ gix/src/repository/worktree.rs | 44 ++++++++++++++++++++++++++++++++ gix/src/worktree/mod.rs | 3 +++ gix/tests/repository/open.rs | 4 +-- gix/tests/repository/worktree.rs | 13 ++++++++++ justfile | 2 +- 10 files changed, 108 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cda20b22b1e..e693835a2fe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1347,6 +1347,7 @@ dependencies = [ "gix-utils 0.1.4", "gix-validate 0.7.6", "gix-worktree 0.22.0", + "gix-worktree-stream", "is_ci", "log", "once_cell", diff --git a/Cargo.toml b/Cargo.toml index 256e87c7671..e4d37fdfc55 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -118,10 +118,10 @@ pretty-cli = [ "gitoxide-core/serde", "prodash/progress-tree", "prodash/progress prodash-render-line-crossterm = ["prodash-render-line", "prodash/render-line-crossterm", "prodash/signal-hook", "is-terminal", "crosstermion"] ## Progress reporting with a TUI, can then be enabled with the `--progress` flag. -prodash-render-tui = ["prodash/render-tui", "prodash/render-tui-crossterm", "prodash/progress-tree", "futures-lite"] +prodash-render-tui = ["prodash/render-tui", "prodash/render-tui-crossterm", "gix/progress-tree", "futures-lite"] ## Progress reporting by visually drawing lines into the terminal without switching to an alternate window. -prodash-render-line = ["prodash/render-line", "prodash-render-line-crossterm", "prodash/progress-tree"] +prodash-render-line = ["prodash/render-line", "prodash-render-line-crossterm", "gix/progress-tree"] ## Prints statistical information to inform about cache efficiency when those are dropped. ## Use this as a way to understand if bigger caches actually produce greater yiedls. diff --git a/gix/Cargo.toml b/gix/Cargo.toml index 1094420695a..bf62a7f4f5b 100644 --- a/gix/Cargo.toml +++ b/gix/Cargo.toml @@ -30,7 +30,7 @@ required-features = ["blocking-network-client"] [features] -default = ["max-performance-safe", "comfort"] +default = ["max-performance-safe", "comfort", "extras"] #! ### Mutually Exclusive Network Client #! Either `async-*` or `blocking-*` versions of these toggles may be enabled at a time. @@ -53,6 +53,15 @@ blocking-http-transport-reqwest-native-tls = ["blocking-http-transport-reqwest", #! ### Other +## Various additional features and capabilities that are not necessarily part of what most users would need. +extras = ["worktree-stream"] + +## Make it possible to turn a tree into a stream of bytes, which can be decoded to entries and turned into various other formats. +worktree-stream = ["gix-worktree-stream"] + +## Various progress-related features that improve the look of progress message units. +comfort = ["gix-features/progress-unit-bytes", "gix-features/progress-unit-human-numbers"] + ## Data structures implement `serde::Serialize` and `serde::Deserialize`. serde = [ "dep:serde", "gix-pack/serde", @@ -72,11 +81,9 @@ serde = [ "dep:serde", "gix-credentials/serde"] ## Re-export the progress tree root which allows to obtain progress from various functions which take `impl gix::Progress`. +## Applications which want to display progress will probably need this implementation. progress-tree = ["prodash/progress-tree"] -## Various progress-related features that improve the look of progress message units. 
-comfort = ["gix-features/progress-unit-bytes", "gix-features/progress-unit-human-numbers"] - ## Print debugging information about usage of object database caches, useful for tuning cache sizes. cache-efficiency-debug = ["gix-features/cache-efficiency-debug"] @@ -137,8 +144,6 @@ gix-negotiate = { version = "^0.5.0", path = "../gix-negotiate" } gix-path = { version = "^0.8.3", path = "../gix-path" } gix-url = { version = "^0.21.0", path = "../gix-url" } gix-traverse = { version = "^0.30.0", path = "../gix-traverse" } -gix-protocol = { version = "^0.36.0", path = "../gix-protocol", optional = true } -gix-transport = { version = "^0.34.0", path = "../gix-transport", optional = true } gix-diff = { version = "^0.33.0", path = "../gix-diff" } gix-mailmap = { version = "^0.16.0", path = "../gix-mailmap" } gix-features = { version = "^0.32.0", path = "../gix-features", features = ["progress", "once_cell"] } @@ -154,6 +159,13 @@ gix-worktree = { version = "^0.22.0", path = "../gix-worktree" } gix-hashtable = { version = "^0.2.3", path = "../gix-hashtable" } gix-commitgraph = { version = "^0.18.0", path = "../gix-commitgraph" } +gix-worktree-stream = { version = "^0.2.0", path = "../gix-worktree-stream", optional = true } + +# For communication with remotes +gix-protocol = { version = "^0.36.0", path = "../gix-protocol", optional = true } +gix-transport = { version = "^0.34.0", path = "../gix-transport", optional = true } + +# Just to get the progress-tree feature prodash = { version = "25.0", optional = true, default-features = false, features = ["progress-tree"] } once_cell = "1.14.0" signal-hook = { version = "0.3.9", default-features = false } diff --git a/gix/src/id.rs b/gix/src/id.rs index c6dcc5593dc..027b4f4d51e 100644 --- a/gix/src/id.rs +++ b/gix/src/id.rs @@ -5,7 +5,7 @@ use gix_hash::{oid, ObjectId}; use crate::{object::find, revision, Id, Object}; -/// An [object id][ObjectId] infused with `Easy`. +/// An [object id][ObjectId] infused with a [`Repository`][crate::Repository]. impl<'repo> Id<'repo> { /// Find the [`Object`] associated with this object id, and consider it an error if it doesn't exist. /// diff --git a/gix/src/repository/mod.rs b/gix/src/repository/mod.rs index 5833fe19033..67a91fd47cc 100644 --- a/gix/src/repository/mod.rs +++ b/gix/src/repository/mod.rs @@ -82,3 +82,26 @@ pub mod index_or_load_from_head { OpenIndex(#[from] crate::worktree::open_index::Error), } } + +/// +#[cfg(feature = "worktree-stream")] +pub mod worktree_stream { + /// The error returned by [`Repository::worktree_stream()`][crate::Repository::worktree_stream()]. 
+ #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error(transparent)] + FindTree(#[from] crate::object::find::existing::Error), + #[error(transparent)] + OpenTree(#[from] gix_traverse::tree::breadthfirst::Error), + #[error(transparent)] + AttributesCache(#[from] crate::repository::attributes::Error), + #[error(transparent)] + FilterPipeline(#[from] crate::filter::pipeline::options::Error), + #[error("Needed {id} to be a tree to turn into a workspace stream, got {actual}")] + NotATree { + id: gix_hash::ObjectId, + actual: gix_object::Kind, + }, + } +} diff --git a/gix/src/repository/worktree.rs b/gix/src/repository/worktree.rs index 756b848fb4d..08f156a9cb6 100644 --- a/gix/src/repository/worktree.rs +++ b/gix/src/repository/worktree.rs @@ -49,4 +49,48 @@ impl crate::Repository { pub fn is_bare(&self) -> bool { self.config.is_bare && self.work_dir().is_none() } + + /// If `id` points to a tree, produce a stream that yields one worktree entry after the other. The index of the tree at `id` + /// is returned as well as it is an intermediate byproduct that might be useful to callers. + /// + /// The entries will look exactly like they would if one would check them out, with filters applied. + /// The `export-ignore` attribute is used to skip blobs or directories to which it applies. + #[cfg(feature = "worktree-stream")] + pub fn worktree_stream( + &self, + id: impl Into, + ) -> Result<(gix_worktree_stream::Stream, gix_index::File), crate::repository::worktree_stream::Error> { + use gix_odb::{FindExt, HeaderExt}; + let id = id.into(); + let header = self.objects.header(id)?; + if !header.kind().is_tree() { + return Err(crate::repository::worktree_stream::Error::NotATree { + id: id.to_owned(), + actual: header.kind(), + }); + } + + // TODO(perf): potential performance improvements could be to use the index at `HEAD` if possible (`index_from_head_tree…()`) + // TODO(perf): when loading a non-HEAD tree, we effectively traverse the tree twice. This is usually fast though, and sharing + // an object cache between the copies of the ODB handles isn't trivial and needs a lock. + let index = self.index_from_tree(&id)?; + let mut cache = self.attributes_only(&index, gix_worktree::cache::state::attributes::Source::IdMapping)?; + let pipeline = + gix_filter::Pipeline::new(cache.attributes_collection(), crate::filter::Pipeline::options(self)?); + let objects = self.objects.clone().into_arc().expect("TBD error handling"); + let stream = gix_worktree_stream::from_tree( + id, + { + let objects = objects.clone(); + move |id, buf| objects.find(id, buf) + }, + pipeline, + move |path, mode, attrs| -> std::io::Result<()> { + let entry = cache.at_entry(path, Some(mode.is_tree()), |id, buf| objects.find_blob(id, buf))?; + entry.matching_attributes(attrs); + Ok(()) + }, + ); + Ok((stream, index)) + } } diff --git a/gix/src/worktree/mod.rs b/gix/src/worktree/mod.rs index f61a72f93f3..1f88b327785 100644 --- a/gix/src/worktree/mod.rs +++ b/gix/src/worktree/mod.rs @@ -7,6 +7,9 @@ use crate::{ Repository, }; +#[cfg(feature = "worktree-stream")] +pub use gix_worktree_stream as stream; + pub(crate) type IndexStorage = gix_features::threading::OwnShared>; /// A lazily loaded and auto-updated worktree index. 
pub type Index = gix_fs::SharedFileSnapshot; diff --git a/gix/tests/repository/open.rs b/gix/tests/repository/open.rs index d330f9490ba..3aafd1c01af 100644 --- a/gix/tests/repository/open.rs +++ b/gix/tests/repository/open.rs @@ -164,8 +164,8 @@ mod object_caches { fn default_git_and_custom_caches() -> crate::Result { let opts = gix::open::Options::isolated(); let repo = named_subrepo_opts("make_config_repos.sh", "object-caches", opts)?; - assert!(repo.objects.has_object_cache()); - assert!(repo.objects.has_pack_cache()); + assert_eq!(repo.objects.has_object_cache(), cfg!(feature = "comfort")); + assert_eq!(repo.objects.has_pack_cache(), cfg!(feature = "comfort")); Ok(()) } diff --git a/gix/tests/repository/worktree.rs b/gix/tests/repository/worktree.rs index ceb35bd8112..f2d7462bf84 100644 --- a/gix/tests/repository/worktree.rs +++ b/gix/tests/repository/worktree.rs @@ -1,5 +1,18 @@ use gix_ref::bstr; +#[test] +#[cfg(feature = "worktree-stream")] +fn stream() -> crate::Result { + let repo = crate::named_repo("make_packed_and_loose.sh")?; + let mut stream = repo.worktree_stream(repo.head_commit()?.tree_id()?)?.0.into_read(); + assert_eq!( + std::io::copy(&mut stream, &mut std::io::sink())?, + 102, + "there is some content in the stream, it works" + ); + Ok(()) +} + mod with_core_worktree_config { use std::io::BufRead; diff --git a/justfile b/justfile index a4bbf4e8fe6..f7feac137c3 100755 --- a/justfile +++ b/justfile @@ -163,7 +163,7 @@ unit-tests: cargo test -p gix-protocol --features blocking-client cargo test -p gix-protocol --features async-client cargo test -p gix-protocol - cargo test -p gix + cargo test -p gix --no-default-features cargo test -p gix --features async-network-client cargo test -p gix --features blocking-network-client cargo test -p gix --features regex From 717950977fa758812bc4dd5713f96995bddc491a Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 21 Jul 2023 12:23:46 +0200 Subject: [PATCH 11/18] feat: add `interrupt::Write` to auto-fail writes on interrupt. --- gix-features/src/interrupt.rs | 38 ++++++++++++++++++++++++++++++++++- gix-features/src/progress.rs | 9 +++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/gix-features/src/interrupt.rs b/gix-features/src/interrupt.rs index 1f78e613aa9..dc7a2db170f 100644 --- a/gix-features/src/interrupt.rs +++ b/gix-features/src/interrupt.rs @@ -91,7 +91,7 @@ where /// A wrapper for implementors of [`std::io::Read`] or [`std::io::BufRead`] with interrupt support. /// -/// It fails a [read][`std::io::Read::read`] while an interrupt was requested. +/// It fails a [read][std::io::Read::read] while an interrupt was requested. pub struct Read<'a, R> { /// The actual implementor of [`std::io::Read`] to which interrupt support will be added. pub inner: R, @@ -123,3 +123,39 @@ where self.inner.consume(amt) } } + +/// A wrapper for implementors of [`std::io::Write`] with interrupt checks on each write call. +/// +/// It fails a [write][std::io::Write::write] while an interrupt was requested. +pub struct Write<'a, W> { + /// The actual implementor of [`std::io::Write`] to which interrupt support will be added. 
+ pub inner: W, + /// The flag to trigger interruption + pub should_interrupt: &'a AtomicBool, +} + +impl io::Write for Write<'_, W> +where + W: std::io::Write, +{ + fn write(&mut self, buf: &[u8]) -> io::Result { + if self.should_interrupt.load(Ordering::Relaxed) { + return Err(std::io::Error::new(std::io::ErrorKind::Other, "Interrupted")); + } + self.inner.write(buf) + } + + fn flush(&mut self) -> io::Result<()> { + // Don't interrupt here, allow flushes to happen to prefer disk consistency. + self.inner.flush() + } +} + +impl io::Seek for Write<'_, W> +where + W: std::io::Seek, +{ + fn seek(&mut self, pos: io::SeekFrom) -> io::Result { + self.inner.seek(pos) + } +} diff --git a/gix-features/src/progress.rs b/gix-features/src/progress.rs index 8d1e30bc450..6a90d84227d 100644 --- a/gix-features/src/progress.rs +++ b/gix-features/src/progress.rs @@ -136,3 +136,12 @@ where self.inner.flush() } } + +impl io::Seek for Write +where + T: io::Seek, +{ + fn seek(&mut self, pos: io::SeekFrom) -> io::Result { + self.inner.seek(pos) + } +} From c4a1fb1ba461c28ac3ea2482adf5f75721d14706 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 21 Jul 2023 10:44:46 +0200 Subject: [PATCH 12/18] feat: add `Repository::archive()` as extra It implements a high-level interface to achieve `git archive` like functionality. --- Cargo.lock | 1 + gix/Cargo.toml | 9 ++++++- gix/src/repository/mod.rs | 7 +++++ gix/src/repository/worktree.rs | 45 ++++++++++++++++++++++++++++++++ gix/src/worktree/mod.rs | 2 ++ gix/tests/repository/worktree.rs | 18 +++++++++++++ 6 files changed, 81 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index e693835a2fe..39dfcab7f1b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1310,6 +1310,7 @@ dependencies = [ "async-std", "document-features", "gix-actor 0.24.0", + "gix-archive", "gix-attributes 0.15.0", "gix-commitgraph", "gix-config", diff --git a/gix/Cargo.toml b/gix/Cargo.toml index bf62a7f4f5b..31f257ee0b8 100644 --- a/gix/Cargo.toml +++ b/gix/Cargo.toml @@ -54,11 +54,17 @@ blocking-http-transport-reqwest-native-tls = ["blocking-http-transport-reqwest", #! ### Other ## Various additional features and capabilities that are not necessarily part of what most users would need. -extras = ["worktree-stream"] +extras = ["worktree-stream", "worktree-archive"] ## Make it possible to turn a tree into a stream of bytes, which can be decoded to entries and turned into various other formats. worktree-stream = ["gix-worktree-stream"] +## Create archives from a tree in the repository, similar to what `git archive` does. +## +## Note that we disable all default features which strips it off all container support, like `tar` and `zip`. +## Your application should add it as dependency and re-activate the desired features. +worktree-archive = ["gix-archive", "worktree-stream"] + ## Various progress-related features that improve the look of progress message units. 
comfort = ["gix-features/progress-unit-bytes", "gix-features/progress-unit-human-numbers"] @@ -160,6 +166,7 @@ gix-hashtable = { version = "^0.2.3", path = "../gix-hashtable" } gix-commitgraph = { version = "^0.18.0", path = "../gix-commitgraph" } gix-worktree-stream = { version = "^0.2.0", path = "../gix-worktree-stream", optional = true } +gix-archive = { version = "0.2.0", path = "../gix-archive", default-features = false, optional = true } # For communication with remotes gix-protocol = { version = "^0.36.0", path = "../gix-protocol", optional = true } diff --git a/gix/src/repository/mod.rs b/gix/src/repository/mod.rs index 67a91fd47cc..12000e407f4 100644 --- a/gix/src/repository/mod.rs +++ b/gix/src/repository/mod.rs @@ -105,3 +105,10 @@ pub mod worktree_stream { }, } } + +/// +#[cfg(feature = "worktree-archive")] +pub mod worktree_archive { + /// The error returned by [`Repository::worktree_archive()`][crate::Repository::worktree_archive()]. + pub type Error = gix_archive::Error; +} diff --git a/gix/src/repository/worktree.rs b/gix/src/repository/worktree.rs index 08f156a9cb6..bfdb093078b 100644 --- a/gix/src/repository/worktree.rs +++ b/gix/src/repository/worktree.rs @@ -93,4 +93,49 @@ impl crate::Repository { ); Ok((stream, index)) } + + /// Produce an archive from the `stream` and write it to `out` according to `options`. + /// Use `blob` to provide progress for each entry written to `out`, and note that it should already be initialized to the amount + /// of expected entries, with `should_interrupt` being queried between each entry to abort if needed, and on each write to `out`. + /// + /// ### Performance + /// + /// Be sure that `out` is able to handle a lot of write calls. Otherwise wrap it in a [`BufWriter`][std::io::BufWriter]. + /// + /// ### Additional progress and fine-grained interrupt handling + /// + /// For additional progress reporting, wrap `out` into a writer that counts throughput on each write. + /// This can also be used to react to interrupts on each write, instead of only for each entry. 
+ #[cfg(feature = "worktree-archive")] + pub fn worktree_archive( + &self, + mut stream: gix_worktree_stream::Stream, + out: impl std::io::Write, + mut blobs: impl gix_features::progress::Progress, + should_interrupt: &std::sync::atomic::AtomicBool, + options: gix_archive::Options, + ) -> Result<(), crate::repository::worktree_archive::Error> { + let mut out = gix_features::interrupt::Write { + inner: out, + should_interrupt, + }; + if options.format == gix_archive::Format::InternalTransientNonPersistable { + std::io::copy(&mut stream.into_read(), &mut out)?; + return Ok(()); + } + gix_archive::write_stream( + &mut stream, + |stream| { + if should_interrupt.load(std::sync::atomic::Ordering::Relaxed) { + return Err(std::io::Error::new(std::io::ErrorKind::Other, "Cancelled by user").into()); + } + let res = stream.next_entry(); + blobs.inc(); + res + }, + out, + options, + )?; + Ok(()) + } } diff --git a/gix/src/worktree/mod.rs b/gix/src/worktree/mod.rs index 1f88b327785..473fd14d911 100644 --- a/gix/src/worktree/mod.rs +++ b/gix/src/worktree/mod.rs @@ -7,6 +7,8 @@ use crate::{ Repository, }; +#[cfg(feature = "worktree-archive")] +pub use gix_archive as archive; #[cfg(feature = "worktree-stream")] pub use gix_worktree_stream as stream; diff --git a/gix/tests/repository/worktree.rs b/gix/tests/repository/worktree.rs index f2d7462bf84..a63c58d84ce 100644 --- a/gix/tests/repository/worktree.rs +++ b/gix/tests/repository/worktree.rs @@ -13,6 +13,24 @@ fn stream() -> crate::Result { Ok(()) } +#[test] +#[cfg(feature = "worktree-archive")] +fn archive() -> crate::Result { + let repo = crate::named_repo("make_packed_and_loose.sh")?; + let (stream, _index) = repo.worktree_stream(repo.head_commit()?.tree_id()?)?; + let mut buf = Vec::::new(); + + repo.worktree_archive( + stream, + &mut buf, + gix_features::progress::Discard, + &std::sync::atomic::AtomicBool::default(), + Default::default(), + )?; + assert_eq!(buf.len(), 102, "default format is internal"); + Ok(()) +} + mod with_core_worktree_config { use std::io::BufRead; From 32bbb8b7b1f195adf7e5f06fd2ddc19153516a2f Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 21 Jul 2023 14:23:30 +0200 Subject: [PATCH 13/18] feat: add simple CLI for `gix archive` --- Cargo.lock | 1 + Cargo.toml | 5 +- gitoxide-core/Cargo.toml | 4 ++ gitoxide-core/src/repository/archive.rs | 80 +++++++++++++++++++++++++ gitoxide-core/src/repository/mod.rs | 2 + src/plumbing/main.rs | 27 +++++++++ src/plumbing/options/mod.rs | 34 +++++++++++ 7 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 gitoxide-core/src/repository/archive.rs diff --git a/Cargo.lock b/Cargo.lock index 39dfcab7f1b..a40f0d2adb7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1282,6 +1282,7 @@ dependencies = [ "futures-io", "futures-lite", "gix", + "gix-archive", "gix-pack", "gix-transport", "gix-url", diff --git a/Cargo.toml b/Cargo.toml index e4d37fdfc55..03c0b44c95b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -128,7 +128,7 @@ prodash-render-line = ["prodash/render-line", "prodash-render-line-crossterm", " cache-efficiency-debug = ["gix-features/cache-efficiency-debug"] ## A way to enable most `gitoxide-core` tools found in `ein tools`, namely `organize` and `estimate hours`. 
-gitoxide-core-tools = ["gitoxide-core/organize", "gitoxide-core/estimate-hours"] +gitoxide-core-tools = ["gitoxide-core/organize", "gitoxide-core/estimate-hours", "gitoxide-core-tools-archive"] ## A program to perform analytics on a `git` repository, using an auto-maintained sqlite database gitoxide-core-tools-query = ["gitoxide-core/query"] @@ -136,6 +136,9 @@ gitoxide-core-tools-query = ["gitoxide-core/query"] ## A program to run algorithms on a corpus of repositories, recording each run for later comparison. gitoxide-core-tools-corpus = ["gitoxide-core/corpus"] +## A sub-command to generate archive from virtual worktree checkouts. +gitoxide-core-tools-archive = ["gitoxide-core/archive"] + #! ### Building Blocks for mutually exclusive networking #! Blocking and async features are mutually exclusive and cause a compile-time error. This also means that `cargo … --all-features` will fail. #! Within each section, features can be combined. diff --git a/gitoxide-core/Cargo.toml b/gitoxide-core/Cargo.toml index f270c6e61ea..eafe20907f9 100644 --- a/gitoxide-core/Cargo.toml +++ b/gitoxide-core/Cargo.toml @@ -25,6 +25,9 @@ query = ["dep:rusqlite"] ## *Note that* `organize` we need for finding git repositories fast. corpus = [ "dep:rusqlite", "dep:sysinfo", "organize", "dep:crossbeam-channel", "dep:serde_json", "dep:tracing-forest", "dep:tracing-subscriber", "dep:tracing", "dep:parking_lot" ] +## The ability to create archives from virtual worktrees, similar to `git archive`. +archive = ["dep:gix-archive-for-configuration-only", "gix/worktree-archive"] + #! ### Mutually Exclusive Networking #! If both are set, _blocking-client_ will take precedence, allowing `--all-features` to be used. @@ -44,6 +47,7 @@ serde = ["gix/serde", "dep:serde_json", "dep:serde", "bytesize/serde"] gix = { version = "^0.49.1", path = "../gix", default-features = false } gix-pack-for-configuration-only = { package = "gix-pack", version = "^0.40.0", path = "../gix-pack", default-features = false, features = ["pack-cache-lru-dynamic", "pack-cache-lru-static"] } gix-transport-configuration-only = { package = "gix-transport", version = "^0.34.0", path = "../gix-transport", default-features = false } +gix-archive-for-configuration-only = { package = "gix-archive", version = "^0.2.0", path = "../gix-archive", optional = true, features = ["tar"] } serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] } anyhow = "1.0.42" thiserror = "1.0.34" diff --git a/gitoxide-core/src/repository/archive.rs b/gitoxide-core/src/repository/archive.rs new file mode 100644 index 00000000000..32e5949f085 --- /dev/null +++ b/gitoxide-core/src/repository/archive.rs @@ -0,0 +1,80 @@ +use anyhow::bail; +use gix::worktree::archive; +use gix::Progress; +use std::ops::Add; +use std::path::Path; + +pub fn stream( + repo: gix::Repository, + destination_path: Option<&Path>, + rev_spec: Option<&str>, + mut progress: impl Progress, + format: Option, +) -> anyhow::Result<()> { + let format = format.map_or_else(|| format_from_ext(destination_path), Ok)?; + let object = repo.rev_parse_single(rev_spec.unwrap_or("HEAD"))?.object()?; + let (modification_date, tree) = fetch_rev_info(object)?; + + let start = std::time::Instant::now(); + let (stream, index) = repo.worktree_stream(tree)?; + + let mut entries = progress.add_child("entries"); + entries.init(Some(index.entries().len()), gix::progress::count("entries")); + let mut bytes = progress.add_child("written"); + bytes.init(None, gix::progress::bytes()); + + let mut file = 
gix::progress::Write { + inner: match destination_path { + Some(path) => Box::new(std::io::BufWriter::with_capacity( + 128 * 1024, + std::fs::File::create(path)?, + )) as Box, + None => Box::new(std::io::sink()), + }, + progress: &mut bytes, + }; + repo.worktree_archive( + stream, + &mut file, + &mut entries, + &gix::interrupt::IS_INTERRUPTED, + gix::worktree::archive::Options { + format, + tree_prefix: None, + modification_time: modification_date + .map(|t| std::time::UNIX_EPOCH.add(std::time::Duration::from_secs(t as u64))) + .unwrap_or_else(std::time::SystemTime::now), + }, + )?; + + entries.show_throughput(start); + bytes.show_throughput(start); + + Ok(()) +} + +fn fetch_rev_info( + object: gix::Object<'_>, +) -> anyhow::Result<(Option, gix::ObjectId)> { + Ok(match object.kind { + gix::object::Kind::Commit => { + let commit = object.into_commit(); + (Some(commit.committer()?.time.seconds), commit.tree_id()?.detach()) + } + gix::object::Kind::Tree => (None, object.id), + gix::object::Kind::Tag => fetch_rev_info(object.peel_to_kind(gix::object::Kind::Commit)?)?, + gix::object::Kind::Blob => bail!("Cannot derive commit or tree from blob at {}", object.id), + }) +} + +fn format_from_ext(path: Option<&Path>) -> anyhow::Result { + Ok(match path { + Some(path) => match path.extension().and_then(|ext| ext.to_str()) { + None => bail!("Cannot derive archive format from a file without extension"), + Some("tar") => archive::Format::Tar, + Some("stream") => archive::Format::InternalTransientNonPersistable, + Some(ext) => bail!("Format for extendion '{ext}' is unsupported"), + }, + None => archive::Format::InternalTransientNonPersistable, + }) +} diff --git a/gitoxide-core/src/repository/mod.rs b/gitoxide-core/src/repository/mod.rs index d373d62339d..da20a66d49c 100644 --- a/gitoxide-core/src/repository/mod.rs +++ b/gitoxide-core/src/repository/mod.rs @@ -11,6 +11,8 @@ pub fn init(directory: Option) -> Result Result<()> { })?; match cmd { + #[cfg(feature = "gitoxide-core-tools-archive")] + Subcommands::Archive(crate::plumbing::options::archive::Platform { + format, + output_file, + treeish, + }) => prepare_and_run( + "archive", + trace, + auto_verbose, + progress, + progress_keep_open, + None, + move |progress, _out, _err| { + core::repository::archive::stream( + repository(Mode::Lenient)?, + output_file.as_deref(), + treeish.as_deref(), + progress, + format.map(|f| match f { + crate::plumbing::options::archive::Format::Internal => { + gix::worktree::archive::Format::InternalTransientNonPersistable + } + crate::plumbing::options::archive::Format::Tar => gix::worktree::archive::Format::Tar, + }), + ) + }, + ), #[cfg(feature = "gitoxide-core-tools-corpus")] Subcommands::Corpus(crate::plumbing::options::corpus::Platform { db, path, cmd }) => { let reverse_trace_lines = progress; diff --git a/src/plumbing/options/mod.rs b/src/plumbing/options/mod.rs index b04cd355cd9..79333922d53 100644 --- a/src/plumbing/options/mod.rs +++ b/src/plumbing/options/mod.rs @@ -77,6 +77,9 @@ pub struct Args { #[derive(Debug, clap::Subcommand)] pub enum Subcommands { + /// Subcommands for creating worktree archivs + #[cfg(feature = "gitoxide-core-tools-archive")] + Archive(archive::Platform), /// Subcommands for interacting with commit-graphs #[clap(subcommand)] CommitGraph(commitgraph::Subcommands), @@ -129,6 +132,37 @@ pub enum Subcommands { Free(free::Subcommands), } +#[cfg(feature = "gitoxide-core-tools-archive")] +pub mod archive { + use std::path::PathBuf; + + #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, 
Ord, clap::ValueEnum)] + pub enum Format { + /// An internal format that is for debugging, it should not be persisted and cannot be read back. + /// + /// However, it represents that bare data stream without with minimal overhead, and is a good + /// metric for throughput. + Internal, + /// Use the `.tar` file format, uncompressed. + Tar, + } + + #[derive(Debug, clap::Parser)] + pub struct Platform { + #[clap(long, short = 'f', value_enum)] + pub format: Option, + /// The file to write the archive to, or discard the output immediately. + /// + /// It's extension determines the archive format, unless `--format` is set. + pub output_file: Option, + + /// The revspec of the commit or tree to traverse, or the tree at `HEAD` if unspecified. + /// + /// If commit, the commit timestamp will be used as timestamp for each file in the archive. + pub treeish: Option, + } +} + #[cfg(feature = "gitoxide-core-tools-corpus")] pub mod corpus { use std::path::PathBuf; From b9b9e9e3da0692dd9b3acc4270b286d72b9c718e Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 21 Jul 2023 16:24:15 +0200 Subject: [PATCH 14/18] feat: add `zip` support for `gix-archive`, with the similarly named cargo feature. --- Cargo.lock | 83 +++++++++++++ gix-archive/Cargo.toml | 12 +- gix-archive/src/lib.rs | 35 +++++- gix-archive/src/write.rs | 220 +++++++++++++++++++++++++++-------- gix-archive/tests/archive.rs | 90 ++++++++++++-- justfile | 2 + 6 files changed, 379 insertions(+), 63 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a40f0d2adb7..93519712220 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "adler32" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" + [[package]] name = "ahash" version = "0.8.3" @@ -376,6 +382,12 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + [[package]] name = "bytes" version = "1.4.0" @@ -620,6 +632,15 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +[[package]] +name = "core2" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505" +dependencies = [ + "memchr", +] + [[package]] name = "cpufeatures" version = "0.2.9" @@ -858,6 +879,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "dary_heap" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7762d17f1241643615821a8455a0b2c3e803784b058693d990b11f2dce25a0ca" + [[package]] name = "dashmap" version = "5.5.0" @@ -1404,6 +1431,7 @@ dependencies = [ "bstr", "document-features", "gix-attributes 0.15.0", + "gix-date 0.7.0", "gix-filter", "gix-hash 0.11.3", "gix-object 0.33.0", @@ -1412,8 +1440,11 @@ dependencies = [ "gix-testtools", "gix-worktree 0.22.0", "gix-worktree-stream", + "libflate", "tar", "thiserror", + "time", + "zip", ] [[package]] @@ -2621,6 
+2652,15 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +[[package]] +name = "hashbrown" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" +dependencies = [ + "ahash", +] + [[package]] name = "hashbrown" version = "0.14.0" @@ -3024,6 +3064,30 @@ version = "0.2.147" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" +[[package]] +name = "libflate" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7d5654ae1795afc7ff76f4365c2c8791b0feb18e8996a96adad8ffd7c3b2bf" +dependencies = [ + "adler32", + "core2", + "crc32fast", + "dary_heap", + "libflate_lz77", +] + +[[package]] +name = "libflate_lz77" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be5f52fb8c451576ec6b79d3f4deb327398bc05bbdbd99021a6e77a4c855d524" +dependencies = [ + "core2", + "hashbrown 0.13.2", + "rle-decode-fast", +] + [[package]] name = "libgit2-sys" version = "0.15.2+1.6.4" @@ -3810,6 +3874,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "rle-decode-fast" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" + [[package]] name = "rusqlite" version = "0.29.0" @@ -5056,3 +5126,16 @@ name = "yansi" version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09041cd90cf85f7f8b2df60c646f853b7f535ce68f85244eb6731cf89fa498ec" + +[[package]] +name = "zip" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261" +dependencies = [ + "byteorder", + "crc32fast", + "crossbeam-utils", + "flate2", + "time", +] diff --git a/gix-archive/Cargo.toml b/gix-archive/Cargo.toml index fcb4edb2f04..92398d6bb30 100644 --- a/gix-archive/Cargo.toml +++ b/gix-archive/Cargo.toml @@ -12,16 +12,26 @@ rust-version = "1.65" doctest = false [features] -default = ["tar"] +default = ["tar", "tar_gz", "zip"] ## Enable the `tar` archive format. It has support for all information, except for object ids. tar = ["dep:tar", "dep:gix-path"] +## Enable the `tar.gz` archive format. +tar_gz = ["tar", "dep:libflate"] + +## Enable the `zip` archive format. 
+zip = ["dep:zip", "dep:time"] [dependencies] gix-worktree-stream = { version = "^0.2.0", path = "../gix-worktree-stream" } gix-object = { version = "^0.33.0", path = "../gix-object" } gix-path = { version = "^0.8.3", path = "../gix-path", optional = true } +gix-date = { version = "^0.7.0", path = "../gix-date" } + +libflate = { version = "2.0.0", optional = true } +zip = { version = "0.6.6", optional = true, default-features = false, features = ["deflate", "time"] } +time = { version = "0.3.22", optional = true, default-features = false, features = ["std"] } thiserror = "1.0.26" bstr = { version = "1.5.0", default-features = false } diff --git a/gix-archive/src/lib.rs b/gix-archive/src/lib.rs index 541ae378f1b..c0434132cab 100644 --- a/gix-archive/src/lib.rs +++ b/gix-archive/src/lib.rs @@ -28,6 +28,12 @@ pub enum Error { NextStreamEntry(#[from] gix_worktree_stream::entry::Error), #[error("The internal format cannot be used as an archive, it's merely a debugging tool")] InternalFormatMustNotPersist, + #[error("Support for the format '{wanted:?}' was not compiled in")] + SupportNotCompiledIn { wanted: Format }, + #[error("Cannot create a zip archive if output stream does not support seek")] + ZipWithoutSeek, + #[error("Cannot use modification as it is not within the supported bounds")] + InvalidModificationTime(#[source] Box), } /// The supported container formats for use in [`write_stream()`]. @@ -44,8 +50,24 @@ pub enum Format { /// /// Use it as well if a custom container format is desired. The idea is to decode it on a separate thread /// to rewrite the data to the desired format. - #[cfg(feature = "tar")] Tar, + /// A convenience format that will `gzip` deflate the `tar` stream, using the default compression level. + // TODO: figure out how to do this with `libflate`. + TarGz, + /// A standard `zip` archive. Note that this format silently converts illformed UTF-8 to UTF-8, which will + /// equal a change of path. + /// + /// Requires the `zip` feature toggle to have an effect. + /// + /// ### Shortcoming + /// + /// Even though symlinks are stored as such, for some reason at least on MacOS those aren't restored. That works, + /// however, when letting `git` create the archive. + Zip { + /// If `None`, use the default compression level. Otherwise use the given one which + /// ranges from 0-9 for the deflate algorithm. + compression_level: Option, + }, } /// Options for configuring [`write_stream()`]. @@ -57,10 +79,10 @@ pub struct Options { /// /// Note that that `/` should be used as separator, and that a prefix directory has to end with `/`. pub tree_prefix: Option, - /// The modification time for all entries in the archive. + /// The modification time for all entries in the archive as seen since UNIX epoch. /// /// Defaults to the current time. The caller may set this to the commit time if available. 
- pub modification_time: std::time::SystemTime, + pub modification_time: gix_date::SecondsSinceUnixEpoch, } impl Default for Options { @@ -68,10 +90,13 @@ impl Default for Options { Options { format: Default::default(), tree_prefix: None, - modification_time: std::time::SystemTime::now(), + modification_time: std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|t| t.as_secs() as i64) + .unwrap_or_default(), } } } mod write; -pub use write::write_stream; +pub use write::{write_stream, write_stream_seek}; diff --git a/gix-archive/src/write.rs b/gix-archive/src/write.rs index 2671fb88eed..b93937237e3 100644 --- a/gix-archive/src/write.rs +++ b/gix-archive/src/write.rs @@ -1,7 +1,8 @@ use crate::{Error, Format, Options}; use gix_worktree_stream::{Entry, Stream}; -/// Write all stream entries in `stream` as provided by `next_entry(stream)` to `out` configured according to `opts`. +/// Write all stream entries in `stream` as provided by `next_entry(stream)` to `out` configured according to `opts` which +/// also includes the streaming format. /// /// ### Performance /// @@ -21,66 +22,69 @@ where if opts.format == Format::InternalTransientNonPersistable { return Err(Error::InternalFormatMustNotPersist); } - #[cfg(feature = "tar")] + #[cfg(any(feature = "tar", feature = "tar_gz"))] { enum State { #[cfg(feature = "tar")] Tar((tar::Builder, Vec)), + #[cfg(feature = "tar_gz")] + TarGz((tar::Builder>, Vec)), } impl State { - pub fn new(format: Format, out: W) -> Self { - match format { + pub fn new(format: Format, out: W) -> Result { + Ok(match format { Format::InternalTransientNonPersistable => unreachable!("handled earlier"), + Format::Zip { .. } => return Err(Error::ZipWithoutSeek), #[cfg(feature = "tar")] - Format::Tar => State::Tar(( + Format::Tar => { + #[cfg(feature = "tar")] { - let mut ar = tar::Builder::new(out); - ar.mode(tar::HeaderMode::Deterministic); - ar - }, - Vec::with_capacity(64 * 1024), - )), - } + State::Tar(( + { + let mut ar = tar::Builder::new(out); + ar.mode(tar::HeaderMode::Deterministic); + ar + }, + Vec::with_capacity(64 * 1024), + )) + } + #[cfg(not(feature = "tar"))] + { + Err(Error::SupportNotCompiledIn { wanted: Format::Tar }) + } + } + Format::TarGz => { + #[cfg(feature = "tar_gz")] + { + State::TarGz(( + { + let mut ar = tar::Builder::new(libflate::gzip::Encoder::new(out)?); + ar.mode(tar::HeaderMode::Deterministic); + ar + }, + Vec::with_capacity(64 * 1024), + )) + } + #[cfg(not(feature = "tar_gz"))] + { + Err(Error::SupportNotCompiledIn { wanted: Format::TarGz }) + } + } + }) } } - let mut state = State::new(opts.format, out); - let mtime_seconds_since_epoch = opts - .modification_time - .duration_since(std::time::UNIX_EPOCH) - .ok() - .map(|d| d.as_secs()); - - while let Some(mut entry) = next_entry(stream)? { + let mut state = State::new(opts.format, out)?; + while let Some(entry) = next_entry(stream)? 
{ match &mut state { #[cfg(feature = "tar")] State::Tar((ar, buf)) => { - let mut header = tar::Header::new_gnu(); - if let Some(mtime) = mtime_seconds_since_epoch { - header.set_mtime(mtime); - } - header.set_entry_type(tar_entry_type(entry.mode)); - header.set_mode(if matches!(entry.mode, gix_object::tree::EntryMode::BlobExecutable) { - 0o755 - } else { - 0o644 - }); - buf.clear(); - std::io::copy(&mut entry, buf)?; - - let path = gix_path::from_bstr(add_prefix(entry.relative_path(), opts.tree_prefix.as_ref())); - header.set_size(buf.len() as u64); - - if entry.mode == gix_object::tree::EntryMode::Link { - use bstr::ByteSlice; - let target = gix_path::from_bstr(buf.as_bstr()); - header.set_entry_type(tar::EntryType::Symlink); - header.set_size(0); - ar.append_link(&mut header, path, target)?; - } else { - ar.append_data(&mut header, path, buf.as_slice())?; - } + append_tar_entry(ar, buf, entry, opts.modification_time, &opts)?; + } + #[cfg(feature = "tar_gz")] + State::TarGz((ar, buf)) => { + append_tar_entry(ar, buf, entry, opts.modification_time, &opts)?; } } } @@ -90,12 +94,134 @@ where State::Tar((mut ar, _)) => { ar.finish()?; } + #[cfg(feature = "tar_gz")] + State::TarGz((ar, _)) => { + ar.into_inner()?.finish(); + } + } + } + Ok(()) +} + +/// Like [`write_stream()`], but requires [`std::io::Seek`] for `out`. +/// +/// Note that `zip` is able to stream big files, which our `tar` implementation is not able to do, which makes it the +/// only suitable container to support huge files from `git-lfs` without consuming excessive amounts of memory. +#[cfg_attr(not(feature = "zip"), allow(unused_mut, unused_variables))] +pub fn write_stream_seek( + stream: &mut Stream, + mut next_entry: NextFn, + out: impl std::io::Write + std::io::Seek, + opts: Options, +) -> Result<(), Error> +where + NextFn: FnMut(&mut Stream) -> Result>, gix_worktree_stream::entry::Error>, +{ + let compression_level = match opts.format { + Format::Zip { compression_level } => compression_level.map(|lvl| lvl as i32), + _other => return write_stream(stream, next_entry, out, opts), + }; + + #[cfg(feature = "zip")] + { + let mut ar = zip::write::ZipWriter::new(out); + let mut buf = Vec::new(); + let mtime = time::OffsetDateTime::from_unix_timestamp(opts.modification_time) + .map_err(|err| Error::InvalidModificationTime(Box::new(err)))? + .try_into() + .map_err(|err| Error::InvalidModificationTime(Box::new(err)))?; + while let Some(entry) = next_entry(stream)? 
{ + append_zip_entry( + &mut ar, + entry, + &mut buf, + mtime, + compression_level, + opts.tree_prefix.as_ref(), + )?; } + ar.finish() + .map_err(|err| std::io::Error::new(std::io::ErrorKind::Other, err))?; + } + + Ok(()) +} + +#[cfg(feature = "zip")] +fn append_zip_entry( + ar: &mut zip::write::ZipWriter, + mut entry: gix_worktree_stream::Entry<'_>, + buf: &mut Vec, + mtime: zip::DateTime, + compression_level: Option, + tree_prefix: Option<&bstr::BString>, +) -> Result<(), Error> { + let file_opts = zip::write::FileOptions::default() + .compression_method(zip::CompressionMethod::Deflated) + .compression_level(compression_level) + .large_file(entry.bytes_remaining().map_or(true, |len| len >= 4_000_000_000)) + .last_modified_time(mtime) + .unix_permissions(if matches!(entry.mode, gix_object::tree::EntryMode::BlobExecutable) { + 0o755 + } else { + 0o644 + }); + let path = add_prefix(entry.relative_path(), tree_prefix).into_owned(); + match entry.mode { + gix_object::tree::EntryMode::Blob | gix_object::tree::EntryMode::BlobExecutable => { + ar.start_file(path.to_string(), file_opts) + .map_err(|err| std::io::Error::new(std::io::ErrorKind::Other, err))?; + std::io::copy(&mut entry, ar)?; + } + gix_object::tree::EntryMode::Tree | gix_object::tree::EntryMode::Commit => { + ar.add_directory(path.to_string(), file_opts) + .map_err(|err| std::io::Error::new(std::io::ErrorKind::Other, err))?; + } + gix_object::tree::EntryMode::Link => { + use bstr::ByteSlice; + std::io::copy(&mut entry, buf)?; + ar.add_symlink(path.to_string(), buf.as_bstr().to_string(), file_opts) + .map_err(|err| std::io::Error::new(std::io::ErrorKind::Other, err))?; + } + } + Ok(()) +} + +#[cfg(any(feature = "tar", feature = "tar_gz"))] +fn append_tar_entry( + ar: &mut tar::Builder, + buf: &mut Vec, + mut entry: gix_worktree_stream::Entry<'_>, + mtime_seconds_since_epoch: i64, + opts: &Options, +) -> Result<(), Error> { + let mut header = tar::Header::new_gnu(); + header.set_mtime(mtime_seconds_since_epoch as u64); + header.set_entry_type(tar_entry_type(entry.mode)); + header.set_mode(if matches!(entry.mode, gix_object::tree::EntryMode::BlobExecutable) { + 0o755 + } else { + 0o644 + }); + buf.clear(); + std::io::copy(&mut entry, buf)?; + + let path = gix_path::from_bstr(add_prefix(entry.relative_path(), opts.tree_prefix.as_ref())); + header.set_size(buf.len() as u64); + + if entry.mode == gix_object::tree::EntryMode::Link { + use bstr::ByteSlice; + let target = gix_path::from_bstr(buf.as_bstr()); + header.set_entry_type(tar::EntryType::Symlink); + header.set_size(0); + ar.append_link(&mut header, path, target)?; + } else { + ar.append_data(&mut header, path, buf.as_slice())?; } Ok(()) } -#[cfg(feature = "tar")] +#[cfg(any(feature = "tar", feature = "tar_gz"))] fn tar_entry_type(mode: gix_object::tree::EntryMode) -> tar::EntryType { use gix_object::tree::EntryMode; use tar::EntryType; @@ -107,7 +233,7 @@ fn tar_entry_type(mode: gix_object::tree::EntryMode) -> tar::EntryType { } } -#[cfg(feature = "tar")] +#[cfg(any(feature = "tar", feature = "tar_gz"))] fn add_prefix<'a>(relative_path: &'a bstr::BStr, prefix: Option<&bstr::BString>) -> std::borrow::Cow<'a, bstr::BStr> { use std::borrow::Cow; match prefix { diff --git a/gix-archive/tests/archive.rs b/gix-archive/tests/archive.rs index a58ba972031..83878939200 100644 --- a/gix-archive/tests/archive.rs +++ b/gix-archive/tests/archive.rs @@ -132,6 +132,63 @@ mod from_tree { }) } + #[test] + #[cfg(feature = "tar_gz")] + fn basic_usage_tar_gz() -> gix_testtools::Result { + 
basic_usage(gix_archive::Format::TarGz, |buf| { + assert!( + buf.len() < 385, + "quite a bit smaller than uncompressed: {} < 385", + buf.len() + ); + Ok(()) + }) + } + + #[test] + #[cfg(feature = "zip")] + fn basic_usage_zip() -> gix_testtools::Result { + basic_usage( + gix_archive::Format::Zip { + compression_level: Some(9), + }, + |buf| { + assert!( + buf.len() < 1200, + "bigger than uncompressed for some reason: {} < 1200", + buf.len() + ); + let mut ar = zip::ZipArchive::new(std::io::Cursor::new(buf.as_slice()))?; + assert_eq!( + { + let mut n: Vec<_> = ar.file_names().collect(); + n.sort(); + n + }, + &[ + "prefix/.gitattributes", + "prefix/a", + "prefix/dir/b", + "prefix/dir/subdir/exe", + "prefix/extra-dir-empty/", + "prefix/extra-dir/symlink-to-extra", + "prefix/extra-exe", + "prefix/extra-file", + "prefix/symlink-to-a" + ] + ); + let mut link = ar.by_name("prefix/symlink-to-a")?; + assert!(!link.is_dir()); + assert!(link.is_file(), "no symlink differentiation"); + assert_eq!(link.unix_mode(), Some(0o120644), "it's all in the mode"); + let mut buf = Vec::new(); + link.read_to_end(&mut buf)?; + assert_eq!(buf.as_bstr(), "a"); + Ok(()) + }, + ) + } + fn basic_usage( format: gix_archive::Format, make_assertion: impl FnOnce(Vec) -> gix_testtools::Result, @@ -161,16 +218,29 @@ mod from_tree { if format == Format::InternalTransientNonPersistable { std::io::copy(&mut stream.into_read(), &mut buf)?; } else { - gix_archive::write_stream( - &mut stream, - gix_worktree_stream::Stream::next_entry, - &mut buf, - gix_archive::Options { - format, - tree_prefix: Some("prefix/".into()), - modification_time: std::time::UNIX_EPOCH + std::time::Duration::from_secs(120), - }, - )?; + if matches!(format, Format::Zip { .. }) { + gix_archive::write_stream_seek( + &mut stream, + gix_worktree_stream::Stream::next_entry, + std::io::Cursor::new(&mut buf), + gix_archive::Options { + format, + tree_prefix: Some("prefix/".into()), + modification_time: 1820000000, // needs to be within a certain bound to be a valid MSDos time! + }, + )?; + } else { + gix_archive::write_stream( + &mut stream, + gix_worktree_stream::Stream::next_entry, + &mut buf, + gix_archive::Options { + format, + tree_prefix: Some("prefix/".into()), + modification_time: 120, + }, + )?; + } assert!( stream.next_entry()?.is_none(), "stream is exhausted, all written to buf" diff --git a/justfile b/justfile index f7feac137c3..d5d05f18ac0 100755 --- a/justfile +++ b/justfile @@ -133,6 +133,8 @@ unit-tests: cargo test --all cargo test -p gix-archive --no-default-features cargo test -p gix-archive --features tar + cargo test -p gix-archive --features tar_gz + cargo test -p gix-archive --features zip cd gix-object; \ set -ex; \ cargo test; \ From 61aed0e955974f65f4fea042cbae68ea8a2cc2f5 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 21 Jul 2023 22:04:37 +0200 Subject: [PATCH 15/18] Change archive implementation to require the seek bound. Only that way zip can be supported, and it seems not worth it to support non-seek versions of it. 
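As a caller-side sketch (not part of the patch itself), the stricter bound is still easy to satisfy for purely in-memory output by wrapping the buffer in `std::io::Cursor`, which is what the adjusted tests in this series do. The helper name and the `anyhow` error handling below are placeholders, and the `gix`-level paths are assumed to match the re-exports used elsewhere in this series:

    fn zip_to_memory(repo: &gix::Repository) -> anyhow::Result<Vec<u8>> {
        // Stream the tree at `HEAD` and archive it as `zip`, entirely in memory.
        let (stream, _index) = repo.worktree_stream(repo.head_commit()?.tree_id()?)?;
        let mut buf = Vec::new();
        repo.worktree_archive(
            stream,
            // `Vec<u8>` alone does not implement `Seek`, but a `Cursor` over it does.
            std::io::Cursor::new(&mut buf),
            gix::features::progress::Discard,
            &std::sync::atomic::AtomicBool::default(),
            gix::worktree::archive::Options {
                format: gix::worktree::archive::Format::Zip { compression_level: Some(9) },
                tree_prefix: None,
                // seconds since the UNIX epoch; must map to a valid MS-DOS time for `zip`
                modification_time: 1_820_000_000,
            },
        )?;
        Ok(buf)
    }

A `std::fs::File` (or anything else implementing `std::io::Write + std::io::Seek`) works the same way, so formats that could previously stream to a plain writer lose nothing by the change.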
--- gix/src/repository/worktree.rs | 4 ++-- gix/tests/repository/worktree.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gix/src/repository/worktree.rs b/gix/src/repository/worktree.rs index bfdb093078b..c182e624342 100644 --- a/gix/src/repository/worktree.rs +++ b/gix/src/repository/worktree.rs @@ -110,7 +110,7 @@ impl crate::Repository { pub fn worktree_archive( &self, mut stream: gix_worktree_stream::Stream, - out: impl std::io::Write, + out: impl std::io::Write + std::io::Seek, mut blobs: impl gix_features::progress::Progress, should_interrupt: &std::sync::atomic::AtomicBool, options: gix_archive::Options, @@ -123,7 +123,7 @@ impl crate::Repository { std::io::copy(&mut stream.into_read(), &mut out)?; return Ok(()); } - gix_archive::write_stream( + gix_archive::write_stream_seek( &mut stream, |stream| { if should_interrupt.load(std::sync::atomic::Ordering::Relaxed) { diff --git a/gix/tests/repository/worktree.rs b/gix/tests/repository/worktree.rs index a63c58d84ce..944a4cebe83 100644 --- a/gix/tests/repository/worktree.rs +++ b/gix/tests/repository/worktree.rs @@ -22,7 +22,7 @@ fn archive() -> crate::Result { repo.worktree_archive( stream, - &mut buf, + std::io::Cursor::new(&mut buf), gix_features::progress::Discard, &std::sync::atomic::AtomicBool::default(), Default::default(), From 567b1a4488c43c1f7099435d10cdddbc3a98a5cc Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 21 Jul 2023 17:25:24 +0200 Subject: [PATCH 16/18] Add compression support to `gix archive`, which is where it should shine. --- Cargo.toml | 2 +- deny.toml | 3 +- gitoxide-core/Cargo.toml | 2 +- gitoxide-core/src/repository/archive.rs | 37 ++++++++++++------------- src/plumbing/main.rs | 7 ++++- src/plumbing/options/mod.rs | 12 ++++++-- 6 files changed, 37 insertions(+), 26 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 03c0b44c95b..35234c17aa6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -202,7 +202,7 @@ overflow-checks = false # this bloats files but assures destructors are called, important for tempfiles. One day I hope we # can wire up the 'abrt' signal handler so tempfiles will be removed in case of panics. 
panic = 'unwind' -codegen-units = 1 +#codegen-units = 1 incremental = false build-override = { opt-level = 0 } diff --git a/deny.toml b/deny.toml index a495ea435c5..8a800755120 100644 --- a/deny.toml +++ b/deny.toml @@ -42,7 +42,8 @@ allow = [ "MIT-0", "ISC", "Unicode-DFS-2016", - "LicenseRef-ring" + "LicenseRef-ring", + "Zlib" ] # Lint level for licenses considered copyleft copyleft = "allow" diff --git a/gitoxide-core/Cargo.toml b/gitoxide-core/Cargo.toml index eafe20907f9..4307c332bc1 100644 --- a/gitoxide-core/Cargo.toml +++ b/gitoxide-core/Cargo.toml @@ -47,7 +47,7 @@ serde = ["gix/serde", "dep:serde_json", "dep:serde", "bytesize/serde"] gix = { version = "^0.49.1", path = "../gix", default-features = false } gix-pack-for-configuration-only = { package = "gix-pack", version = "^0.40.0", path = "../gix-pack", default-features = false, features = ["pack-cache-lru-dynamic", "pack-cache-lru-static"] } gix-transport-configuration-only = { package = "gix-transport", version = "^0.34.0", path = "../gix-transport", default-features = false } -gix-archive-for-configuration-only = { package = "gix-archive", version = "^0.2.0", path = "../gix-archive", optional = true, features = ["tar"] } +gix-archive-for-configuration-only = { package = "gix-archive", version = "^0.2.0", path = "../gix-archive", optional = true, features = ["tar", "tar_gz"] } serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] } anyhow = "1.0.42" thiserror = "1.0.34" diff --git a/gitoxide-core/src/repository/archive.rs b/gitoxide-core/src/repository/archive.rs index 32e5949f085..b9e048b7ea3 100644 --- a/gitoxide-core/src/repository/archive.rs +++ b/gitoxide-core/src/repository/archive.rs @@ -1,12 +1,11 @@ use anyhow::bail; use gix::worktree::archive; use gix::Progress; -use std::ops::Add; use std::path::Path; pub fn stream( repo: gix::Repository, - destination_path: Option<&Path>, + destination_path: &Path, rev_spec: Option<&str>, mut progress: impl Progress, format: Option, @@ -24,13 +23,7 @@ pub fn stream( bytes.init(None, gix::progress::bytes()); let mut file = gix::progress::Write { - inner: match destination_path { - Some(path) => Box::new(std::io::BufWriter::with_capacity( - 128 * 1024, - std::fs::File::create(path)?, - )) as Box, - None => Box::new(std::io::sink()), - }, + inner: std::io::BufWriter::with_capacity(128 * 1024, std::fs::File::create(destination_path)?), progress: &mut bytes, }; repo.worktree_archive( @@ -41,9 +34,12 @@ pub fn stream( gix::worktree::archive::Options { format, tree_prefix: None, - modification_time: modification_date - .map(|t| std::time::UNIX_EPOCH.add(std::time::Duration::from_secs(t as u64))) - .unwrap_or_else(std::time::SystemTime::now), + modification_time: modification_date.unwrap_or_else(|| { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_secs()) + .unwrap_or_default() as gix::date::SecondsSinceUnixEpoch + }), }, )?; @@ -67,14 +63,15 @@ fn fetch_rev_info( }) } -fn format_from_ext(path: Option<&Path>) -> anyhow::Result { - Ok(match path { - Some(path) => match path.extension().and_then(|ext| ext.to_str()) { - None => bail!("Cannot derive archive format from a file without extension"), - Some("tar") => archive::Format::Tar, - Some("stream") => archive::Format::InternalTransientNonPersistable, - Some(ext) => bail!("Format for extendion '{ext}' is unsupported"), +fn format_from_ext(path: &Path) -> anyhow::Result { + Ok(match path.extension().and_then(std::ffi::OsStr::to_str) { + None => bail!("Cannot 
derive archive format from a file without extension"), + Some("tar") => archive::Format::Tar, + Some("gz") => archive::Format::TarGz, + Some("zip") => archive::Format::Zip { + compression_level: None, }, - None => archive::Format::InternalTransientNonPersistable, + Some("stream") => archive::Format::InternalTransientNonPersistable, + Some(ext) => bail!("Format for extension '{ext}' is unsupported"), }) } diff --git a/src/plumbing/main.rs b/src/plumbing/main.rs index 67aa6400b62..809564087b1 100644 --- a/src/plumbing/main.rs +++ b/src/plumbing/main.rs @@ -138,6 +138,7 @@ pub fn main() -> Result<()> { #[cfg(feature = "gitoxide-core-tools-archive")] Subcommands::Archive(crate::plumbing::options::archive::Platform { format, + compression_level, output_file, treeish, }) => prepare_and_run( @@ -150,7 +151,7 @@ pub fn main() -> Result<()> { move |progress, _out, _err| { core::repository::archive::stream( repository(Mode::Lenient)?, - output_file.as_deref(), + &output_file, treeish.as_deref(), progress, format.map(|f| match f { @@ -158,6 +159,10 @@ pub fn main() -> Result<()> { gix::worktree::archive::Format::InternalTransientNonPersistable } crate::plumbing::options::archive::Format::Tar => gix::worktree::archive::Format::Tar, + crate::plumbing::options::archive::Format::TarGz => gix::worktree::archive::Format::TarGz, + crate::plumbing::options::archive::Format::Zip => { + gix::worktree::archive::Format::Zip { compression_level } + } }), ) }, diff --git a/src/plumbing/options/mod.rs b/src/plumbing/options/mod.rs index 79333922d53..0d5975fdc57 100644 --- a/src/plumbing/options/mod.rs +++ b/src/plumbing/options/mod.rs @@ -145,16 +145,24 @@ pub mod archive { Internal, /// Use the `.tar` file format, uncompressed. Tar, + /// Use the `.tar.gz` file format, compressed with `gzip`. + TarGz, + /// Use the `.zip` container format. + Zip, } #[derive(Debug, clap::Parser)] pub struct Platform { + /// Explicitly set the format. Otherwise derived from the suffix of the output file. #[clap(long, short = 'f', value_enum)] pub format: Option, - /// The file to write the archive to, or discard the output immediately. + /// The compression strength to use. Currently only used for `.zip` archives, valid from 0-9. + #[clap(long, short = 'c', value_enum)] + pub compression_level: Option, + /// The file to write the archive to. /// /// It's extension determines the archive format, unless `--format` is set. - pub output_file: Option, + pub output_file: PathBuf, /// The revspec of the commit or tree to traverse, or the tree at `HEAD` if unspecified. 
/// From 4a9d0f1df6f7a20f7f1769fd9c7dae2e53f7e83f Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 22 Jul 2023 08:25:34 +0200 Subject: [PATCH 17/18] Allow to create additional entries via the command-line --- gitoxide-core/src/repository/archive.rs | 39 +++++++++++++++++++++---- src/plumbing/main.rs | 37 ++++++++++++++++------- src/plumbing/options/mod.rs | 11 ++++++- 3 files changed, 71 insertions(+), 16 deletions(-) diff --git a/gitoxide-core/src/repository/archive.rs b/gitoxide-core/src/repository/archive.rs index b9e048b7ea3..3494aacbacc 100644 --- a/gitoxide-core/src/repository/archive.rs +++ b/gitoxide-core/src/repository/archive.rs @@ -1,21 +1,50 @@ -use anyhow::bail; +use anyhow::{anyhow, bail}; use gix::worktree::archive; use gix::Progress; -use std::path::Path; +use std::path::{Path, PathBuf}; + +pub struct Options { + pub format: Option, + pub files: Vec<(String, String)>, + pub prefix: Option, + pub add_paths: Vec, +} pub fn stream( repo: gix::Repository, destination_path: &Path, rev_spec: Option<&str>, mut progress: impl Progress, - format: Option, + Options { + format, + prefix, + add_paths, + files, + }: Options, ) -> anyhow::Result<()> { let format = format.map_or_else(|| format_from_ext(destination_path), Ok)?; let object = repo.rev_parse_single(rev_spec.unwrap_or("HEAD"))?.object()?; let (modification_date, tree) = fetch_rev_info(object)?; let start = std::time::Instant::now(); - let (stream, index) = repo.worktree_stream(tree)?; + let (mut stream, index) = repo.worktree_stream(tree)?; + if !add_paths.is_empty() { + let root = gix::path::realpath( + repo.work_dir() + .ok_or_else(|| anyhow!("Adding files requires a worktree directory that contains them"))?, + )?; + for path in add_paths { + stream.add_entry_from_path(&root, &gix::path::realpath(path)?)?; + } + } + for (path, content) in files { + stream.add_entry(gix::worktree::stream::AdditionalEntry { + id: gix::hash::Kind::Sha1.null(), + mode: gix::object::tree::EntryMode::Blob, + relative_path: path.into(), + source: gix::worktree::stream::entry::Source::Memory(content.into()), + }); + } let mut entries = progress.add_child("entries"); entries.init(Some(index.entries().len()), gix::progress::count("entries")); @@ -33,7 +62,7 @@ pub fn stream( &gix::interrupt::IS_INTERRUPTED, gix::worktree::archive::Options { format, - tree_prefix: None, + tree_prefix: prefix.map(gix::bstr::BString::from), modification_time: modification_date.unwrap_or_else(|| { std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) diff --git a/src/plumbing/main.rs b/src/plumbing/main.rs index 809564087b1..008705dc80d 100644 --- a/src/plumbing/main.rs +++ b/src/plumbing/main.rs @@ -138,7 +138,10 @@ pub fn main() -> Result<()> { #[cfg(feature = "gitoxide-core-tools-archive")] Subcommands::Archive(crate::plumbing::options::archive::Platform { format, + prefix, compression_level, + add_path, + add_virtual_file, output_file, treeish, }) => prepare_and_run( @@ -149,21 +152,35 @@ pub fn main() -> Result<()> { progress_keep_open, None, move |progress, _out, _err| { + if add_virtual_file.len() % 2 != 0 { + anyhow::bail!( + "Virtual files must be specified in pairs of two: slash/separated/path content, got {}", + add_virtual_file.join(", ") + ) + } core::repository::archive::stream( repository(Mode::Lenient)?, &output_file, treeish.as_deref(), progress, - format.map(|f| match f { - crate::plumbing::options::archive::Format::Internal => { - gix::worktree::archive::Format::InternalTransientNonPersistable - } - 
crate::plumbing::options::archive::Format::Tar => gix::worktree::archive::Format::Tar, - crate::plumbing::options::archive::Format::TarGz => gix::worktree::archive::Format::TarGz, - crate::plumbing::options::archive::Format::Zip => { - gix::worktree::archive::Format::Zip { compression_level } - } - }), + core::repository::archive::Options { + add_paths: add_path, + prefix, + files: add_virtual_file + .chunks(2) + .map(|c| (c[0].to_owned(), c[1].clone())) + .collect(), + format: format.map(|f| match f { + crate::plumbing::options::archive::Format::Internal => { + gix::worktree::archive::Format::InternalTransientNonPersistable + } + crate::plumbing::options::archive::Format::Tar => gix::worktree::archive::Format::Tar, + crate::plumbing::options::archive::Format::TarGz => gix::worktree::archive::Format::TarGz, + crate::plumbing::options::archive::Format::Zip => { + gix::worktree::archive::Format::Zip { compression_level } + } + }), + }, ) }, ), diff --git a/src/plumbing/options/mod.rs b/src/plumbing/options/mod.rs index 0d5975fdc57..ca39dcae243 100644 --- a/src/plumbing/options/mod.rs +++ b/src/plumbing/options/mod.rs @@ -156,9 +156,18 @@ pub mod archive { /// Explicitly set the format. Otherwise derived from the suffix of the output file. #[clap(long, short = 'f', value_enum)] pub format: Option, + /// Apply the prefix verbatim to any path we add to the archive. Use a trailing `/` if prefix is a directory. + #[clap(long)] + pub prefix: Option, /// The compression strength to use. Currently only used for `.zip` archives, valid from 0-9. - #[clap(long, short = 'c', value_enum)] + #[clap(long, short = 'l', value_enum)] pub compression_level: Option, + /// Add the given path to the archive. Directories will always be empty. + #[clap(long, short = 'p')] + pub add_path: Vec, + /// Add the new file from a slash-separated path, which must happen in pairs of two, first the path, then the content. + #[clap(long, short = 'v')] + pub add_virtual_file: Vec, /// The file to write the archive to. /// /// It's extension determines the archive format, unless `--format` is set. From 44d9df4abfd555f8bc386c050c7ece0108e80558 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 22 Jul 2023 09:48:58 +0200 Subject: [PATCH 18/18] feat: add `workspace-stream` task This way we can decode workspace checkouts entirely in memory. --- crate-status.md | 19 +++++++------- gitoxide-core/src/corpus/engine.rs | 29 ++++++++++++++++++++-- gitoxide-core/src/corpus/run.rs | 40 ++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 11 deletions(-) diff --git a/crate-status.md b/crate-status.md index f8a4e87d968..30a2673eb73 100644 --- a/crate-status.md +++ b/crate-status.md @@ -600,7 +600,7 @@ See its [README.md](https://github.com/Byron/gitoxide/blob/main/gix-lock/README. ### gix * [x] utilities for applications to make long running operations interruptible gracefully and to support timeouts in servers. -* [ ] handle `core.repositoryFormatVersion` and extensions +* [x] handle `core.repositoryFormatVersion` and extensions * [x] support for unicode-precomposition of command-line arguments (needs explicit use in parent application) * **Repository** * [x] discovery @@ -692,15 +692,16 @@ See its [README.md](https://github.com/Byron/gitoxide/blob/main/gix-lock/README. * [ ] checkout with conversions like clean + smudge as in `.gitattributes` * [ ] _diff_ index with working tree * [ ] sparse checkout support - * [ ] read per-worktree config if `extensions.worktreeConfig` is enabled. 
+ * [x] read per-worktree config if `extensions.worktreeConfig` is enabled. * **index** * [ ] tree from index - * [ ] index from tree + * [x] index from tree * **worktrees** * [x] open a repository with worktrees * [x] read locked state * [ ] obtain 'prunable' information * [x] proper handling of worktree related refs + * [x] create a byte stream and create archives for such a stream, including worktree filters and conversions * [ ] create, move, remove, and repair * [x] access exclude information * [x] access attribute information @@ -727,20 +728,20 @@ See its [README.md](https://github.com/Byron/gitoxide/blob/main/gix-lock/README. ### gix-worktree-stream * [x] encode git-tree as stream of bytes (with large file support and actual streaming) -* [x] decode bytes into entries +* [x] produce a stream of entries * [x] add custom entries to the stream * [x] respect `export-ignore` git attribute * [x] apply standard worktree conversion to simulate an actual checkout +* [ ] support for submodule inclusion * [x] API documentation * [ ] Some examples ### gix-archive -* [ ] `write_to()` for creating an archive with various container formats -* [ ] add custom entries to the archive -* [ ] handling of archive specific filters -* [ ] filter files of tree (with access to attributes) -* [ ] support for worktree conversion (via attributes and filters) +* [x] `write_to()` for creating an archive with various container formats + * [x] `tar` and `tar.gz` + * [x] `zip` +* [x] add prefix and modification date * [ ] API documentation * [ ] Some examples diff --git a/gitoxide-core/src/corpus/engine.rs b/gitoxide-core/src/corpus/engine.rs index ac443be92c0..d591a5970d8 100644 --- a/gitoxide-core/src/corpus/engine.rs +++ b/gitoxide-core/src/corpus/engine.rs @@ -1,3 +1,4 @@ +use std::sync::atomic::AtomicUsize; use std::{ path::{Path, PathBuf}, sync::atomic::Ordering, @@ -111,6 +112,7 @@ impl Engine { self.state.reverse_trace_lines, )?; + let mut num_errors = 0; for repo in &repos { if gix::interrupt::is_triggered() { bail!("interrupted by user"); @@ -135,14 +137,25 @@ impl Engine { &gix::interrupt::IS_INTERRUPTED, ); }); + if let Some(err) = run.error.as_deref() { + num_errors += 1; + repo_progress.fail(err.to_owned()); + } Self::update_run(&self.con, run)?; repo_progress.inc(); } repo_progress.show_throughput(task_start); + if num_errors != 0 { + repo_progress.fail(format!( + "{} repositories failed to run task {}", + num_errors, task.short_name + )); + } } else { let counter = repo_progress.counter(); + let num_errors = AtomicUsize::default(); let repo_progress = gix::threading::OwnShared::new(gix::threading::Mutable::new( - repo_progress.add_child("will be changed"), + repo_progress.add_child("in parallel"), )); gix::parallel::in_parallel_with_slice( &mut repos, @@ -189,6 +202,10 @@ impl Engine { tracing::info_span!("run", run_id = run.id).in_scope(|| { task.perform(&mut run, &repo.path, progress, Some(1), should_interrupt); }); + if let Some(err) = run.error.as_deref() { + num_errors.fetch_add(1, Ordering::SeqCst); + progress.fail(err.to_owned()); + } Self::update_run(con, run)?; if let Some(counter) = counter.as_ref() { counter.fetch_add(1, Ordering::SeqCst); @@ -198,7 +215,15 @@ impl Engine { || (!gix::interrupt::is_triggered()).then(|| Duration::from_millis(100)), std::convert::identity, )?; - gix::threading::lock(&repo_progress).show_throughput(task_start); + let repo_progress = gix::threading::lock(&repo_progress); + repo_progress.show_throughput(task_start); + let num_errors = 
num_errors.load(Ordering::Relaxed); + if num_errors != 0 { + repo_progress.fail(format!( + "{} repositories failed to run task {}", + num_errors, task.short_name + )); + } } repo_progress.inc(); diff --git a/gitoxide-core/src/corpus/run.rs b/gitoxide-core/src/corpus/run.rs index 7ff984add30..fb924fc1d05 100644 --- a/gitoxide-core/src/corpus/run.rs +++ b/gitoxide-core/src/corpus/run.rs @@ -36,6 +36,13 @@ pub(crate) trait Execute { } pub(crate) static ALL: &[Task] = &[ + #[cfg(feature = "archive")] + Task { + short_name: "SWTR", + description: "stream worktree", + execute_exclusive: false, + execute: &WorktreeStream, + }, Task { short_name: "OPNR", description: "open repository (isolated)", @@ -56,6 +63,39 @@ pub(crate) static ALL: &[Task] = &[ }, ]; +#[cfg(feature = "archive")] +struct WorktreeStream; + +#[cfg(feature = "archive")] +impl Execute for WorktreeStream { + fn execute( + &self, + repo: &Path, + progress: &mut corpus::engine::ProgressItem, + _threads: Option, + should_interrupt: &AtomicBool, + ) -> anyhow::Result<()> { + use gix::Progress; + let repo = gix::open_opts(repo, gix::open::Options::isolated())?; + let (stream, _) = { + let _span = gix::trace::coarse!("read index and create worktree stream"); + repo.worktree_stream(repo.head_commit()?.tree_id()?)? + }; + progress.init(None, gix::progress::bytes()); + std::io::copy( + &mut stream.into_read(), + &mut gix::features::interrupt::Write { + inner: gix::features::progress::Write { + inner: std::io::sink(), + progress, + }, + should_interrupt, + }, + )?; + Ok(()) + } +} + struct OpenRepo; impl Execute for OpenRepo {