diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c49221e4635..c58d5f071ef 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -205,7 +205,7 @@ jobs: name: crates without feature toggles - run: set +x; for feature in progress fs-walkdir-parallel parallel io-pipe crc32 zlib zlib-rust-backend fast-sha1 rustsha1 cache-efficiency-debug; do (cd gix-features && cargo build --features $feature --target ${{ matrix.target }}); done name: features of gix-features - - run: set +x; for name in gix-diff gix-pack; do (cd $name && cargo build --features wasm --target ${{ matrix.target }}); done + - run: set +x; for name in gix-pack; do (cd $name && cargo build --features wasm --target ${{ matrix.target }}); done name: crates with 'wasm' feature - run: cd gix-pack && cargo build --all-features --target ${{ matrix.target }} name: gix-pack with all features (including wasm) diff --git a/Cargo.lock b/Cargo.lock index 62c8e3102b2..046451adda6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -956,7 +956,7 @@ checksum = "d4029edd3e734da6fe05b6cd7bd2960760a616bd2ddd0d59a0124746d6272af0" dependencies = [ "cfg-if", "libc", - "redox_syscall", + "redox_syscall 0.3.5", "windows-sys", ] @@ -1525,8 +1525,15 @@ dependencies = [ "bstr", "document-features", "getrandom", + "gix-command", + "gix-filter", + "gix-fs 0.8.0", "gix-hash 0.13.1", "gix-object 0.38.0", + "gix-path 0.10.0", + "gix-tempfile 11.0.0 (registry+https://github.com/rust-lang/crates.io-index)", + "gix-trace 0.1.3", + "gix-worktree 0.27.0", "imara-diff", "serde", "thiserror", @@ -1537,11 +1544,15 @@ name = "gix-diff-tests" version = "0.0.0" dependencies = [ "gix-diff", + "gix-filter", + "gix-fs 0.8.0", "gix-hash 0.13.1", "gix-object 0.38.0", "gix-odb", "gix-testtools", "gix-traverse 0.34.0", + "gix-worktree 0.27.0", + "shell-words", ] [[package]] @@ -1614,6 +1625,17 @@ dependencies = [ "walkdir", ] +[[package]] +name = "gix-features" +version = "0.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51f4365ba17c4f218d7fd9ec102b8d2d3cb0ca200a835e81151ace7778aec827" +dependencies = [ + "gix-hash 0.13.1 (registry+https://github.com/rust-lang/crates.io-index)", + "gix-trace 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", + "libc", +] + [[package]] name = "gix-fetchhead" version = "0.0.0" @@ -1633,6 +1655,7 @@ dependencies = [ "gix-quote 0.4.7", "gix-testtools", "gix-trace 0.1.3", + "gix-utils 0.1.5", "gix-worktree 0.27.0", "once_cell", "smallvec", @@ -1657,6 +1680,15 @@ dependencies = [ "tempfile", ] +[[package]] +name = "gix-fs" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8cd171c0cae97cd0dc57e7b4601cb1ebf596450e263ef3c02be9107272c877bd" +dependencies = [ + "gix-features 0.36.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "gix-fsck" version = "0.1.0" @@ -1715,6 +1747,16 @@ dependencies = [ "thiserror", ] +[[package]] +name = "gix-hash" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1884c7b41ea0875217c1be9ce91322f90bde433e91d374d0e1276073a51ccc60" +dependencies = [ + "faster-hex", + "thiserror", +] + [[package]] name = "gix-hashtable" version = "0.2.4" @@ -1802,7 +1844,7 @@ dependencies = [ "itoa", "libc", "memmap2 0.9.0", - "rustix 0.38.20", + "rustix 0.38.21", "serde", "smallvec", "thiserror", @@ -2089,7 +2131,7 @@ dependencies = [ "gix-config-value", "gix-testtools", "parking_lot", - "rustix 0.38.20", + "rustix 0.38.21", "serial_test", "thiserror", ] @@ -2352,6 +2394,20 @@ dependencies = [ "tempfile", ] +[[package]] +name = "gix-tempfile" +version = "11.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05cc2205cf10d99f70b96e04e16c55d4c7cf33efc151df1f793e29fd12a931f8" +dependencies = [ + "dashmap", + "gix-fs 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", + "libc", + "once_cell", + "parking_lot", + "tempfile", +] + [[package]] name = "gix-testtools" version = "0.13.0" @@ -2939,7 +2995,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" dependencies = [ "hermit-abi", - "rustix 0.38.20", + "rustix 0.38.21", "windows-sys", ] @@ -3466,7 +3522,7 @@ checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" dependencies = [ "cfg-if", "libc", - "redox_syscall", + "redox_syscall 0.3.5", "smallvec", "windows-targets 0.48.5", ] @@ -3717,6 +3773,15 @@ dependencies = [ "bitflags 1.3.2", ] +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "regex" version = "1.10.0" @@ -3865,9 +3930,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.20" +version = "0.38.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67ce50cb2e16c2903e30d1cbccfd8387a74b9d4c938b6a4c5ec6cc7556f7a8a0" +checksum = "2b426b0506e5d50a7d8dafcf2e81471400deb602392c7dd110815afb4eaf02a3" dependencies = [ "bitflags 2.4.0", "errno", @@ -4294,14 +4359,14 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.8.0" +version = "3.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef" +checksum = "7ef1adac450ad7f4b3c28589471ade84f25f731a7a0fe30d71dfa9f60fd808e5" dependencies = [ "cfg-if", "fastrand 2.0.1", - "redox_syscall", - "rustix 0.38.20", + "redox_syscall 0.4.1", + "rustix 0.38.21", "windows-sys", ] diff --git a/crate-status.md b/crate-status.md index a6aa1f119eb..406d594f110 100644 --- a/crate-status.md +++ b/crate-status.md @@ -293,16 +293,29 @@ The top-level crate that acts as hub to all functionality provided by the `gix-* Check out the [performance discussion][gix-diff-performance] as well. * **tree** - * [x] changes needed to obtain _other tree_ + * [x] changes needed to obtain _other tree_ * **patches** - * There are various ways to generate a patch from two blobs. - * [ ] any + * There are various ways to generate a patch from two blobs. + * [ ] text + * [ ] binary * **lines** - * [x] Simple line-by-line diffs powered by the `imara-diff` crate. -* diffing, merging, working with hunks of data -* find differences between various states, i.e. index, working tree, commit-tree + * [x] Simple line-by-line diffs powered by the `imara-diff` crate. +* **generic rename tracker to find renames and copies** + * [x] find by exact match + * [x] find by similarity check + * [ ] heuristics to find best candidate + * [ ] find by basename to help detecting simple moves +* **blob** + * [x] a choice of to-worktree, to-git and to-worktree-if-needed conversions + * [x] `textconv` filters + * [x] special handling of files beyond the big-file threshold. + * [x] detection of binary files by looking at header (first 8k bytes) + * [x] caching of diff-able data + * [x] prepare invocation of external diff program + - [ ] pass meta-info +* [ ] working with hunks of data * [x] API documentation - * [ ] Examples + * [ ] Examples [gix-diff-performance]: https://github.com/Byron/gitoxide/discussions/74 diff --git a/gitoxide-core/src/hours/core.rs b/gitoxide-core/src/hours/core.rs index 9728f6b5ecd..de8c01690be 100644 --- a/gitoxide-core/src/hours/core.rs +++ b/gitoxide-core/src/hours/core.rs @@ -8,7 +8,6 @@ use std::{ use gix::bstr::BStr; use itertools::Itertools; -use smallvec::SmallVec; use crate::hours::{ util::{add_lines, remove_lines}, @@ -92,25 +91,16 @@ pub fn spawn_tree_delta_threads<'scope>( move || -> Result<_, anyhow::Error> { let mut out = Vec::new(); let (commits, changes, lines_count) = stats_counters; - let mut attributes = line_stats + let mut cache = line_stats .then(|| -> anyhow::Result<_> { - repo.index_or_load_from_head().map_err(Into::into).and_then(|index| { - repo.attributes( - &index, - gix::worktree::stack::state::attributes::Source::IdMapping, - gix::worktree::stack::state::ignore::Source::IdMapping, - None, - ) - .map_err(Into::into) - .map(|attrs| { - let matches = attrs.selected_attribute_matches(["binary", "text"]); - (attrs, matches) - }) - }) + Ok(repo.diff_resource_cache(gix::diff::blob::pipeline::Mode::ToGit, Default::default())?) }) .transpose()?; for chunk in rx { for (commit_idx, parent_commit, commit) in chunk { + if let Some(cache) = cache.as_mut() { + cache.clear_resource_cache(); + } commits.fetch_add(1, Ordering::Relaxed); if gix::interrupt::is_triggered() { return Ok(out); @@ -155,47 +145,34 @@ pub fn spawn_tree_delta_threads<'scope>( previous_entry_mode, id, previous_id, - } => { - match (previous_entry_mode.is_blob(), entry_mode.is_blob()) { - (false, false) => {} - (false, true) => { - files.added += 1; - add_lines(line_stats, &lines_count, &mut lines, id); - } - (true, false) => { - files.removed += 1; - remove_lines(line_stats, &lines_count, &mut lines, previous_id); - } - (true, true) => { - files.modified += 1; - if let Some((attrs, matches)) = attributes.as_mut() { - let entry = attrs.at_entry(change.location, Some(false))?; - let is_text_file = if entry.matching_attributes(matches) { - let attrs: SmallVec<[_; 2]> = - matches.iter_selected().collect(); - let binary = &attrs[0]; - let text = &attrs[1]; - !binary.assignment.state.is_set() - && !text.assignment.state.is_unset() - } else { - // In the absence of binary or text markers, we assume it's text. - true - }; - - if let Some(Ok(diff)) = - is_text_file.then(|| change.event.diff()).flatten() - { - let mut nl = 0; - let counts = diff.line_counts(); - nl += counts.insertions as usize + counts.removals as usize; - lines.added += counts.insertions as usize; - lines.removed += counts.removals as usize; - lines_count.fetch_add(nl, Ordering::Relaxed); - } + } => match (previous_entry_mode.is_blob(), entry_mode.is_blob()) { + (false, false) => {} + (false, true) => { + files.added += 1; + add_lines(line_stats, &lines_count, &mut lines, id); + } + (true, false) => { + files.removed += 1; + remove_lines(line_stats, &lines_count, &mut lines, previous_id); + } + (true, true) => { + files.modified += 1; + if let Some(cache) = cache.as_mut() { + let mut diff = change.diff(cache).map_err(|err| { + std::io::Error::new(std::io::ErrorKind::Other, err) + })?; + let mut nl = 0; + if let Some(counts) = diff.line_counts().map_err(|err| { + std::io::Error::new(std::io::ErrorKind::Other, err) + })? { + nl += counts.insertions as usize + counts.removals as usize; + lines.added += counts.insertions as usize; + lines.removed += counts.removals as usize; + lines_count.fetch_add(nl, Ordering::Relaxed); } } } - } + }, } Ok::<_, std::io::Error>(Default::default()) })?; diff --git a/gitoxide-core/src/query/engine/update.rs b/gitoxide-core/src/query/engine/update.rs index 1dcf57ace7b..74d3ba42b1b 100644 --- a/gitoxide-core/src/query/engine/update.rs +++ b/gitoxide-core/src/query/engine/update.rs @@ -6,6 +6,7 @@ use std::{ }; use anyhow::{anyhow, bail}; +use gix::diff::blob::platform::prepare_diff::Operation; use gix::objs::find::Error; use gix::{ bstr::{BStr, BString, ByteSlice}, @@ -173,6 +174,9 @@ pub fn update( repo.object_cache_size_if_unset((object_cache_size_mb * 1024 * 1024) / threads); let rx = rx.clone(); move || -> anyhow::Result<()> { + let mut rewrite_cache = + repo.diff_resource_cache(gix::diff::blob::pipeline::Mode::ToGit, Default::default())?; + let mut diff_cache = rewrite_cache.clone(); for (chunk_id, chunk) in rx { let mut out_chunk = Vec::with_capacity(chunk.len()); for Task { @@ -201,10 +205,12 @@ pub fn update( Some(c) => c, None => continue, }; + rewrite_cache.clear_resource_cache(); + diff_cache.clear_resource_cache(); from.changes()? .track_path() .track_rewrites(Some(rewrites)) - .for_each_to_obtain_tree(&to, |change| { + .for_each_to_obtain_tree_with_cache(&to, &mut rewrite_cache, |change| { use gix::object::tree::diff::change::Event::*; change_counter.fetch_add(1, Ordering::SeqCst); match change.event { @@ -232,30 +238,47 @@ pub fn update( ); } (true, true) => { - // TODO: use git attributes here to know if it's a binary file or not. - if let Some(Ok(diff)) = change.event.diff() { - let mut nl = 0; - let tokens = diff.line_tokens(); - let counts = gix::diff::blob::diff( - diff.algo, - &tokens, - gix::diff::blob::sink::Counter::default(), - ); - nl += counts.insertions as usize - + counts.removals as usize; - let lines = LineStats { - added: counts.insertions as usize, - removed: counts.removals as usize, - before: tokens.before.len(), - after: tokens.after.len(), - }; - lines_counter.fetch_add(nl, Ordering::SeqCst); - out.push(FileChange { - relpath: change.location.to_owned(), - mode: FileMode::Modified, - source_relpath: None, - lines: Some(lines), - }); + if let Ok(cache) = + change.diff(&mut diff_cache).map(|p| p.resource_cache) + { + cache + .options + .skip_internal_diff_if_external_is_configured = + false; + if let Ok(prep) = cache.prepare_diff() { + let mut nl = 0; + let tokens = prep.interned_input(); + match prep.operation { + Operation::InternalDiff { algorithm } => { + let counts = gix::diff::blob::diff( + algorithm, + &tokens, + gix::diff::blob::sink::Counter::default( + ), + ); + nl += counts.insertions as usize + + counts.removals as usize; + let lines = LineStats { + added: counts.insertions as usize, + removed: counts.removals as usize, + before: tokens.before.len(), + after: tokens.after.len(), + }; + lines_counter + .fetch_add(nl, Ordering::SeqCst); + out.push(FileChange { + relpath: change.location.to_owned(), + mode: FileMode::Modified, + source_relpath: None, + lines: Some(lines), + }); + } + Operation::ExternalCommand { .. } => { + unreachable!("disabled above") + } + Operation::SourceOrDestinationIsBinary => {} + } + } } } }, diff --git a/gix-archive/tests/archive.rs b/gix-archive/tests/archive.rs index 95c8e09359e..c9a4e8bb281 100644 --- a/gix-archive/tests/archive.rs +++ b/gix-archive/tests/archive.rs @@ -303,6 +303,6 @@ mod from_tree { } fn noop_pipeline() -> gix_filter::Pipeline { - gix_filter::Pipeline::new(&Default::default(), Default::default(), Default::default()) + gix_filter::Pipeline::new(Default::default(), Default::default()) } } diff --git a/gix-attributes/src/search/outcome.rs b/gix-attributes/src/search/outcome.rs index 7cea3bd1970..41d70e5806c 100644 --- a/gix-attributes/src/search/outcome.rs +++ b/gix-attributes/src/search/outcome.rs @@ -54,8 +54,6 @@ impl Outcome { collection: &MetadataCollection, attribute_names: &mut dyn Iterator>, ) { - self.initialize(collection); - self.selected.clear(); self.selected.extend(attribute_names.map(|name| { ( @@ -63,6 +61,8 @@ impl Outcome { collection.name_to_meta.get(name.as_str()).map(|meta| meta.id), ) })); + + self.initialize(collection); self.reset_remaining(); } diff --git a/gix-diff/Cargo.toml b/gix-diff/Cargo.toml index 8889f53690b..715f5f96cca 100644 --- a/gix-diff/Cargo.toml +++ b/gix-diff/Cargo.toml @@ -13,7 +13,7 @@ autotests = false [features] default = ["blob"] ## Enable diffing of blobs using imara-diff, which also allows for a generic rewrite tracking implementation. -blob = ["dep:imara-diff"] +blob = ["dep:imara-diff", "dep:gix-filter", "dep:gix-worktree", "dep:gix-path", "dep:gix-fs", "dep:gix-command", "dep:gix-tempfile", "dep:gix-trace"] ## Data structures implement `serde::Serialize` and `serde::Deserialize`. serde = ["dep:serde", "gix-hash/serde", "gix-object/serde"] ## Make it possible to compile to the `wasm32-unknown-unknown` target. @@ -25,6 +25,13 @@ doctest = false [dependencies] gix-hash = { version = "^0.13.1", path = "../gix-hash" } gix-object = { version = "^0.38.0", path = "../gix-object" } +gix-filter = { version = "^0.6.0", path = "../gix-filter", optional = true } +gix-worktree = { version = "^0.27.0", path = "../gix-worktree", default-features = false, features = ["attributes"], optional = true } +gix-command = { version = "^0.2.10", path = "../gix-command", optional = true } +gix-path = { version = "^0.10.0", path = "../gix-path", optional = true } +gix-fs = { version = "^0.8.0", path = "../gix-fs", optional = true } +gix-tempfile = { version = "11.0.0", optional = true } +gix-trace = { version = "^0.1.3", path = "../gix-trace", optional = true } thiserror = "1.0.32" imara-diff = { version = "0.1.3", optional = true } diff --git a/gix-diff/src/blob.rs b/gix-diff/src/blob.rs deleted file mode 100644 index 7b2a082bd1e..00000000000 --- a/gix-diff/src/blob.rs +++ /dev/null @@ -1,18 +0,0 @@ -//! For using text diffs, please have a look at the [`imara-diff` documentation](https://docs.rs/imara-diff), -//! maintained by [Pascal Kuthe](https://github.com/pascalkuthe). -//! -//! -/// Information about the diff performed to detect similarity. -#[derive(Debug, Default, Clone, Copy, Eq, PartialEq)] -pub struct DiffLineStats { - /// The amount of lines to remove from the source to get to the destination. - pub removals: u32, - /// The amount of lines to add to the source to get to the destination. - pub insertions: u32, - /// The amount of lines of the previous state, in the source. - pub before: u32, - /// The amount of lines of the new state, in the destination. - pub after: u32, -} - -pub use imara_diff::*; diff --git a/gix-diff/src/blob/mod.rs b/gix-diff/src/blob/mod.rs new file mode 100644 index 00000000000..2c2453318d0 --- /dev/null +++ b/gix-diff/src/blob/mod.rs @@ -0,0 +1,134 @@ +//! For using text diffs, please have a look at the [`imara-diff` documentation](https://docs.rs/imara-diff), +//! maintained by [Pascal Kuthe](https://github.com/pascalkuthe). +pub use imara_diff::*; +use std::collections::HashMap; +use std::path::PathBuf; + +use bstr::BString; + +/// +pub mod pipeline; + +/// +pub mod platform; + +/// Information about the diff performed to detect similarity. +#[derive(Debug, Default, Clone, Copy, PartialEq, PartialOrd)] +pub struct DiffLineStats { + /// The amount of lines to remove from the source to get to the destination. + pub removals: u32, + /// The amount of lines to add to the source to get to the destination. + pub insertions: u32, + /// The amount of lines of the previous state, in the source. + pub before: u32, + /// The amount of lines of the new state, in the destination. + pub after: u32, + /// A range from 0 to 1.0, where 1.0 is a perfect match and 0.5 is a similarity of 50%. + /// Similarity is the ratio between all lines in the previous blob and the current blob, + /// calculated as `(old_lines_count - new_lines_count) as f32 / old_lines_count.max(new_lines_count) as f32`. + pub similarity: f32, +} + +/// A way to classify a resource suitable for diffing. +#[derive(Copy, Clone, Debug, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub enum ResourceKind { + /// The source of a rewrite, rename or copy operation, or generally the old version of a resource. + OldOrSource, + /// The destination of a rewrite, rename or copy operation, or generally the new version of a resource. + NewOrDestination, +} + +/// A set of values to define how to diff something that is associated with it using `git-attributes`, relevant for regular files. +/// +/// Some values are related to diffing, some are related to conversions. +#[derive(Default, Debug, Clone, PartialEq, Eq)] +pub struct Driver { + /// The name of the driver, as referred to by `[diff "name"]` in the git configuration. + pub name: BString, + /// The command to execute to perform the diff entirely like ` old-file old-hex old-mode new-file new-hex new-mode`. + /// + /// Please note that we don't make this call ourselves, but use it to determine that we should not run the our standard + /// built-in algorithm but bail instead as the output of such a program isn't standardized. + pub command: Option, + /// The per-driver algorithm to use. + pub algorithm: Option, + /// The external filter program to call like ` /path/to/blob` which outputs a textual version of the provided + /// binary file. + /// Note that it's invoked with a shell if arguments are given. + /// Further, if present, it will always be executed, whether `is_binary` is set or not. + pub binary_to_text_command: Option, + /// `Some(true)` if this driver deals with binary files, which means that a `binary_to_text_command` should be used to convert binary + /// into a textual representation. + /// Without such a command, anything that is considered binary is not diffed, but only the size of its data is made available. + /// If `Some(false)`, it won't be considered binary, and the its data will not be sampled for the null-byte either. + /// Leaving it to `None` means binary detection is automatic, and is based on the presence of the `0` byte in the first 8kB of the buffer. + pub is_binary: Option, +} + +/// A conversion pipeline to take an object or path from what's stored in `git` to what can be diffed, while +/// following the guidance of git-attributes at the respective path to learn if diffing should happen or if +/// the content is considered binary. +/// +/// There are two different conversion flows, where the target of the flow is a buffer with diffable content: +// TODO: update this with information about possible directions. +/// +/// * `worktree on disk` -> `text conversion` +/// * `object` -> `worktree-filters` -> `text conversion` +#[derive(Clone)] +pub struct Pipeline { + /// A way to read data directly from the worktree. + pub roots: pipeline::WorktreeRoots, + /// A pipeline to convert objects from what's stored in `git` to its worktree version. + pub worktree_filter: gix_filter::Pipeline, + /// Options affecting the way we read files. + pub options: pipeline::Options, + /// Drivers to help customize the conversion behaviour depending on the location of items. + drivers: Vec, + /// Pre-configured attributes to obtain additional diff-related information. + attrs: gix_filter::attributes::search::Outcome, + /// A buffer to manipulate paths + path: PathBuf, +} + +/// A utility for performing a diff of two blobs, including flexible conversions, conversion-caching +/// acquisition of diff information. +/// Note that this instance will not call external filters as their output can't be known programmatically, +/// but it allows to prepare their input if the caller wishes to perform this task. +/// +/// Optimized for NxM lookups with built-in caching. +#[derive(Clone)] +pub struct Platform { + /// The old version of a diff-able blob, if set. + old: Option, + /// The new version of a diff-able blob, if set. + new: Option, + + /// Options to alter how diffs should be performed. + pub options: platform::Options, + /// A way to convert objects into a diff-able format. + pub filter: Pipeline, + /// A way to access .gitattributes + pub attr_stack: gix_worktree::Stack, + + /// The way we convert resources into diffable states. + filter_mode: pipeline::Mode, + /// A continuously growing cache keeping ready-for-diff blobs by their path in the worktree, + /// as that is what affects their final diff-able state. + /// + /// That way, expensive rewrite-checks with NxM matrix checks would be as fast as possible, + /// avoiding duplicate work. + diff_cache: HashMap, +} + +mod impls { + use crate::blob::ResourceKind; + + impl std::fmt::Display for ResourceKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(match self { + ResourceKind::OldOrSource => "old", + ResourceKind::NewOrDestination => "new", + }) + } + } +} diff --git a/gix-diff/src/blob/pipeline.rs b/gix-diff/src/blob/pipeline.rs new file mode 100644 index 00000000000..d0899846d14 --- /dev/null +++ b/gix-diff/src/blob/pipeline.rs @@ -0,0 +1,518 @@ +use crate::blob::{Driver, Pipeline, ResourceKind}; +use bstr::{BStr, ByteSlice}; +use gix_filter::driver::apply::{Delay, MaybeDelayed}; +use gix_filter::pipeline::convert::{ToGitOutcome, ToWorktreeOutcome}; +use gix_object::tree::EntryKind; +use std::io::{Read, Write}; +use std::path::{Path, PathBuf}; +use std::process::{Command, Stdio}; + +/// A way to access roots for different kinds of resources that are possibly located and accessible in a worktree. +#[derive(Clone, Debug, Default)] +pub struct WorktreeRoots { + /// A place where the source of a rewrite, rename or copy, or generally the previous version of resources, are located. + pub old_root: Option, + /// A place where the destination of a rewrite, rename or copy, or generally the new version of resources, are located. + pub new_root: Option, +} + +impl WorktreeRoots { + /// Return the root path for the given `kind` + pub fn by_kind(&self, kind: ResourceKind) -> Option<&Path> { + match kind { + ResourceKind::OldOrSource => self.old_root.as_deref(), + ResourceKind::NewOrDestination => self.new_root.as_deref(), + } + } +} + +/// Data as part of an [Outcome]. +#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug)] +pub enum Data { + /// The data to use for diffing was written into the buffer that was passed during the call to [`Pipeline::convert_to_diffable()`]. + Buffer, + /// The size that the binary blob had at the given revision, without having applied filters, as it's either + /// considered binary or above the big-file threshold. + /// + /// In this state, the binary file cannot be diffed. + Binary { + /// The size of the object prior to performing any filtering or as it was found on disk. + /// + /// Note that technically, the size isn't always representative of the same 'state' of the + /// content, as once it can be the size of the blob in git, and once it's the size of file + /// in the worktree. + size: u64, + }, +} + +/// The outcome returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()). +#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug)] +pub struct Outcome { + /// If available, an index into the `drivers` field to access more diff-related information of the driver for items + /// at the given path, as previously determined by git-attributes. + /// + /// Note that drivers are queried even if there is no object available. + pub driver_index: Option, + /// The data itself, suitable for diffing, and if the object or worktree item is present at all. + pub data: Option, +} + +/// Options for use in a [`Pipeline`]. +#[derive(Default, Clone, Copy, PartialEq, Eq, Debug, Hash, Ord, PartialOrd)] +pub struct Options { + /// The amount of bytes that an object has to reach before being treated as binary. + /// These objects will not be queried, nor will their data be processed in any way. + /// If `0`, no file is ever considered binary due to their size. + /// + /// Note that for files stored in `git`, what counts is their stored, decompressed size, + /// thus `git-lfs` files would typically not be considered binary unless one explicitly sets + /// them + pub large_file_threshold_bytes: u64, + /// Capabilities of the file system which affect how we read worktree files. + pub fs: gix_fs::Capabilities, +} + +/// The specific way to convert a resource. +#[derive(Default, Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] +pub enum Mode { + /// Always prepare the version of the resource as it would be in the work-tree, and + /// apply binary-to-text filters if present. + /// + /// This is typically free for resources in the worktree, and will apply filters to resources in the + /// object database. + #[default] + ToWorktreeAndBinaryToText, + /// Prepare the version of the resource as it would be in the work-tree if + /// binary-to-text filters are present (and apply them), or use the version in `git` otherwise. + ToGitUnlessBinaryToTextIsPresent, + /// Always prepare resources as they are stored in `git`. + /// + /// This is usually fastest, even though resources in the worktree needed to be converted files. + ToGit, +} + +impl Mode { + fn to_worktree(self) -> bool { + matches!( + self, + Mode::ToGitUnlessBinaryToTextIsPresent | Mode::ToWorktreeAndBinaryToText + ) + } + + fn to_git(self) -> bool { + matches!(self, Mode::ToGitUnlessBinaryToTextIsPresent | Mode::ToGit) + } +} + +/// +pub mod convert_to_diffable { + use bstr::BString; + use gix_object::tree::EntryKind; + + /// The error returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error("Entry at '{rela_path}' must be regular file or symlink, but was {actual:?}")] + InvalidEntryKind { rela_path: BString, actual: EntryKind }, + #[error("Entry at '{rela_path}' could not be read as symbolic link")] + ReadLink { rela_path: BString, source: std::io::Error }, + #[error("Entry at '{rela_path}' could not be opened for reading or read from")] + OpenOrRead { rela_path: BString, source: std::io::Error }, + #[error("Entry at '{rela_path}' could not be copied from a filter process to a memory buffer")] + StreamCopy { rela_path: BString, source: std::io::Error }, + #[error("Failed to run '{cmd}' for binary-to-text conversion of entry at {rela_path}")] + RunTextConvFilter { + rela_path: BString, + cmd: String, + source: std::io::Error, + }, + #[error("Tempfile for binary-to-text conversion for entry at {rela_path} could not be created")] + CreateTempfile { rela_path: BString, source: std::io::Error }, + #[error("Binary-to-text conversion '{cmd}' for entry at {rela_path} failed with: {stderr}")] + TextConvFilterFailed { + rela_path: BString, + cmd: String, + stderr: BString, + }, + #[error(transparent)] + FindObject(#[from] gix_object::find::existing_object::Error), + #[error(transparent)] + ConvertToWorktree(#[from] gix_filter::pipeline::convert::to_worktree::Error), + #[error(transparent)] + ConvertToGit(#[from] gix_filter::pipeline::convert::to_git::Error), + } +} + +/// Lifecycle +impl Pipeline { + /// Create a new instance of a pipeline which produces blobs suitable for diffing. `roots` allow to read worktree files directly, otherwise + /// `worktree_filter` is used to transform object database data directly. `drivers` further configure individual paths. + /// `options` are used to further configure the way we act.. + pub fn new( + roots: WorktreeRoots, + worktree_filter: gix_filter::Pipeline, + mut drivers: Vec, + options: Options, + ) -> Self { + drivers.sort_by(|a, b| a.name.cmp(&b.name)); + Pipeline { + roots, + worktree_filter, + drivers, + options, + attrs: { + let mut out = gix_filter::attributes::search::Outcome::default(); + out.initialize_with_selection(&Default::default(), Some("diff")); + out + }, + path: Default::default(), + } + } +} + +/// Access +impl Pipeline { + /// Return all drivers that this instance was initialized with. + pub fn drivers(&self) -> &[super::Driver] { + &self.drivers + } +} + +/// Conversion +impl Pipeline { + /// Convert the object at `id`, `mode`, `rela_path` and `kind`, providing access to `attributes` and `objects`. + /// The resulting diff-able data is written into `out`, assuming it's not too large. The returned [`Outcome`] + /// contains information on how to use `out`, or if it's filled at all. + /// + /// `attributes` must be returning the attributes at `rela_path`, and `objects` must be usable if `kind` is + /// a resource in the object database, i.e. has no worktree root available. + /// + /// If `id` [is null](gix_hash::ObjectId::is_null()) or the file in question doesn't exist in the worktree in case + /// [a root](WorktreeRoots) is present, then `out` will be left cleared and [Outcome::data] will be `None`. + /// + /// Note that `mode` is trusted, and we will not re-validate that the entry in the worktree actually is of that mode. + /// + /// Use `convert` to control what kind of the resource will be produced. + /// + /// ### About Tempfiles + /// + /// When querying from the object database and a binary and a [binary-to-text](Driver::binary_to_text_command) is set, + /// a temporary file will be created to serve as input for the converter program, containing the worktree-data that + /// exactly as it would be present in the worktree if checked out. + /// + /// As these files are ultimately named tempfiles, they will be leaked unless the [gix_tempfile] is configured with + /// a signal handler. If they leak, they would remain in the system's `$TMP` directory. + #[allow(clippy::too_many_arguments)] + pub fn convert_to_diffable( + &mut self, + id: &gix_hash::oid, + mode: EntryKind, + rela_path: &BStr, + kind: ResourceKind, + attributes: &mut dyn FnMut(&BStr, &mut gix_filter::attributes::search::Outcome), + objects: &dyn gix_object::FindObjectOrHeader, + convert: Mode, + out: &mut Vec, + ) -> Result { + let is_symlink = match mode { + EntryKind::Link if self.options.fs.symlink => true, + EntryKind::Blob | EntryKind::BlobExecutable => false, + _ => { + return Err(convert_to_diffable::Error::InvalidEntryKind { + rela_path: rela_path.to_owned(), + actual: mode, + }) + } + }; + + out.clear(); + attributes(rela_path, &mut self.attrs); + let attr = self.attrs.iter_selected().next().expect("pre-initialized with 'diff'"); + let driver_index = attr + .assignment + .state + .as_bstr() + .and_then(|name| self.drivers.binary_search_by(|d| d.name.as_bstr().cmp(name)).ok()); + let driver = driver_index.map(|idx| &self.drivers[idx]); + let mut is_binary = if let Some(driver) = driver { + driver + .is_binary + .map(|is_binary| is_binary && driver.binary_to_text_command.is_none()) + } else { + attr.assignment.state.is_unset().then_some(true) + }; + match self.roots.by_kind(kind) { + Some(root) => { + self.path.clear(); + self.path.push(root); + self.path.push(gix_path::from_bstr(rela_path)); + let data = if is_symlink { + let target = none_if_missing(std::fs::read_link(&self.path)).map_err(|err| { + convert_to_diffable::Error::ReadLink { + rela_path: rela_path.to_owned(), + source: err, + } + })?; + target.map(|target| { + out.extend_from_slice(gix_path::into_bstr(target).as_ref()); + Data::Buffer + }) + } else { + let need_size_only = is_binary == Some(true); + let size_in_bytes = (need_size_only + || (is_binary != Some(false) && self.options.large_file_threshold_bytes > 0)) + .then(|| { + none_if_missing(self.path.metadata().map(|md| md.len())).map_err(|err| { + convert_to_diffable::Error::OpenOrRead { + rela_path: rela_path.to_owned(), + source: err, + } + }) + }) + .transpose()?; + match size_in_bytes { + Some(None) => None, // missing as identified by the size check + Some(Some(size)) if size > self.options.large_file_threshold_bytes || need_size_only => { + Some(Data::Binary { size }) + } + _ => { + match driver + .filter(|_| convert.to_worktree()) + .and_then(|d| d.prepare_binary_to_text_cmd(&self.path)) + { + Some(cmd) => { + // Avoid letting the driver program fail if it doesn't exist. + if self.options.large_file_threshold_bytes == 0 + && none_if_missing(std::fs::symlink_metadata(&self.path)) + .map_err(|err| convert_to_diffable::Error::OpenOrRead { + rela_path: rela_path.to_owned(), + source: err, + })? + .is_none() + { + None + } else { + run_cmd(rela_path, cmd, out)?; + Some(Data::Buffer) + } + } + None => { + let file = none_if_missing(std::fs::File::open(&self.path)).map_err(|err| { + convert_to_diffable::Error::OpenOrRead { + rela_path: rela_path.to_owned(), + source: err, + } + })?; + + match file { + Some(mut file) => { + if convert.to_git() { + let res = self.worktree_filter.convert_to_git( + file, + gix_path::from_bstr(rela_path).as_ref(), + attributes, + &mut |buf| objects.try_find(id, buf).map(|obj| obj.map(|_| ())), + )?; + + match res { + ToGitOutcome::Unchanged(mut file) => { + file.read_to_end(out).map_err(|err| { + convert_to_diffable::Error::OpenOrRead { + rela_path: rela_path.to_owned(), + source: err, + } + })?; + } + ToGitOutcome::Process(mut stream) => { + stream.read_to_end(out).map_err(|err| { + convert_to_diffable::Error::OpenOrRead { + rela_path: rela_path.to_owned(), + source: err, + } + })?; + } + ToGitOutcome::Buffer(buf) => { + out.resize(buf.len(), 0); + out.copy_from_slice(buf); + } + } + } else { + file.read_to_end(out).map_err(|err| { + convert_to_diffable::Error::OpenOrRead { + rela_path: rela_path.to_owned(), + source: err, + } + })?; + } + + Some(if is_binary.unwrap_or_else(|| is_binary_buf(out)) { + let size = out.len() as u64; + out.clear(); + Data::Binary { size } + } else { + Data::Buffer + }) + } + None => None, + } + } + } + } + } + }; + Ok(Outcome { driver_index, data }) + } + None => { + let data = if id.is_null() { + None + } else { + let header = objects + .try_header(id) + .map_err(gix_object::find::existing_object::Error::Find)? + .ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?; + if is_binary.is_none() && self.options.large_file_threshold_bytes > 0 { + is_binary = Some(header.size > self.options.large_file_threshold_bytes); + }; + let data = if is_binary == Some(true) { + Data::Binary { size: header.size } + } else { + objects + .try_find(id, out) + .map_err(gix_object::find::existing_object::Error::Find)? + .ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?; + if matches!(mode, EntryKind::Blob | EntryKind::BlobExecutable) + && convert == Mode::ToWorktreeAndBinaryToText + || (convert == Mode::ToGitUnlessBinaryToTextIsPresent + && driver.map_or(false, |d| d.binary_to_text_command.is_some())) + { + let res = + self.worktree_filter + .convert_to_worktree(out, rela_path, attributes, Delay::Forbid)?; + + let cmd_and_file = driver + .and_then(|d| { + d.binary_to_text_command.is_some().then(|| { + gix_tempfile::new( + std::env::temp_dir(), + gix_tempfile::ContainingDirectory::Exists, + gix_tempfile::AutoRemove::Tempfile, + ) + .and_then(|mut tmp_file| { + self.path.clear(); + tmp_file.with_mut(|tmp| self.path.push(tmp.path()))?; + Ok(tmp_file) + }) + .map(|tmp_file| { + ( + d.prepare_binary_to_text_cmd(&self.path) + .expect("always get cmd if command is set"), + tmp_file, + ) + }) + }) + }) + .transpose() + .map_err(|err| convert_to_diffable::Error::CreateTempfile { + source: err, + rela_path: rela_path.to_owned(), + })?; + match cmd_and_file { + Some((cmd, mut tmp_file)) => { + tmp_file.write_all(out).map_err(|err| { + convert_to_diffable::Error::CreateTempfile { + source: err, + rela_path: rela_path.to_owned(), + } + })?; + out.clear(); + run_cmd(rela_path, cmd, out)?; + } + None => { + match res { + ToWorktreeOutcome::Unchanged(_) => {} + ToWorktreeOutcome::Buffer(src) => { + out.resize(src.len(), 0); + out.copy_from_slice(src); + } + ToWorktreeOutcome::Process(MaybeDelayed::Immediate(mut stream)) => { + std::io::copy(&mut stream, out).map_err(|err| { + convert_to_diffable::Error::StreamCopy { + rela_path: rela_path.to_owned(), + source: err, + } + })?; + } + ToWorktreeOutcome::Process(MaybeDelayed::Delayed(_)) => { + unreachable!("we prohibit this") + } + }; + } + } + } + + if driver.map_or(true, |d| d.binary_to_text_command.is_none()) + && is_binary.unwrap_or_else(|| is_binary_buf(out)) + { + let size = out.len() as u64; + out.clear(); + Data::Binary { size } + } else { + Data::Buffer + } + }; + Some(data) + }; + Ok(Outcome { driver_index, data }) + } + } + } +} + +fn is_binary_buf(buf: &[u8]) -> bool { + let buf = &buf[..buf.len().min(8000)]; + buf.contains(&0) +} + +fn none_if_missing(res: std::io::Result) -> std::io::Result> { + match res { + Ok(data) => Ok(Some(data)), + Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(None), + Err(err) => Err(err), + } +} + +fn run_cmd(rela_path: &BStr, mut cmd: Command, out: &mut Vec) -> Result<(), convert_to_diffable::Error> { + gix_trace::debug!(cmd = ?cmd, "Running binary-to-text command"); + let mut res = cmd + .output() + .map_err(|err| convert_to_diffable::Error::RunTextConvFilter { + rela_path: rela_path.to_owned(), + cmd: format!("{cmd:?}"), + source: err, + })?; + if !res.status.success() { + return Err(convert_to_diffable::Error::TextConvFilterFailed { + rela_path: rela_path.to_owned(), + cmd: format!("{cmd:?}"), + stderr: res.stderr.into(), + }); + } + out.append(&mut res.stdout); + Ok(()) +} + +impl Driver { + /// Produce an invocable command pre-configured to produce the filtered output on stdout after reading `path`. + pub fn prepare_binary_to_text_cmd(&self, path: &Path) -> Option { + let command: &BStr = self.binary_to_text_command.as_ref()?.as_ref(); + let cmd = gix_command::prepare(gix_path::from_bstr(command).into_owned()) + .with_shell() + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .arg(path) + .into(); + Some(cmd) + } +} diff --git a/gix-diff/src/blob/platform.rs b/gix-diff/src/blob/platform.rs new file mode 100644 index 00000000000..143317ba04e --- /dev/null +++ b/gix-diff/src/blob/platform.rs @@ -0,0 +1,613 @@ +use super::Algorithm; +use crate::blob::{pipeline, Pipeline, Platform, ResourceKind}; +use bstr::{BStr, BString, ByteSlice}; +use std::io::Write; +use std::process::Stdio; + +/// A key to uniquely identify either a location in the worktree, or in the object database. +#[derive(Clone)] +pub(crate) struct CacheKey { + id: gix_hash::ObjectId, + location: BString, + /// If `true`, this is an `id` based key, otherwise it's location based. + use_id: bool, + /// Only relevant when `id` is not null, to further differentiate content and allow us to + /// keep track of both links and blobs with the same content (rare, but possible). + is_link: bool, +} + +/// A stored value representing a diffable resource. +#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Debug)] +pub(crate) struct CacheValue { + /// The outcome of converting a resource into a diffable format using [Pipeline::convert_to_diffable()]. + conversion: pipeline::Outcome, + /// The kind of the resource we are looking at. Only possible values are `Blob`, `BlobExecutable` and `Link`. + mode: gix_object::tree::EntryKind, + /// A possibly empty buffer, depending on `conversion.data` which may indicate the data is considered binary. + buffer: Vec, +} + +impl std::hash::Hash for CacheKey { + fn hash(&self, state: &mut H) { + if self.use_id { + self.id.hash(state); + self.is_link.hash(state) + } else { + self.location.hash(state) + } + } +} + +impl PartialEq for CacheKey { + fn eq(&self, other: &Self) -> bool { + match (self.use_id, other.use_id) { + (false, false) => self.location.eq(&other.location), + (true, true) => self.id.eq(&other.id) && self.is_link.eq(&other.is_link), + _ => false, + } + } +} + +impl Eq for CacheKey {} + +impl Default for CacheKey { + fn default() -> Self { + CacheKey { + id: gix_hash::Kind::Sha1.null(), + use_id: false, + is_link: false, + location: BString::default(), + } + } +} + +impl CacheKey { + fn set_location(&mut self, rela_path: &BStr) { + self.location.clear(); + self.location.extend_from_slice(rela_path); + } +} + +/// A resource ready to be diffed in one way or another. +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub struct Resource<'a> { + /// If available, an index into the `drivers` field to access more diff-related information of the driver for items + /// at the given path, as previously determined by git-attributes. + /// + /// Note that drivers are queried even if there is no object available. + pub driver_index: Option, + /// The data itself, suitable for diffing, and if the object or worktree item is present at all. + pub data: resource::Data<'a>, + /// The kind of the resource we are looking at. Only possible values are `Blob`, `BlobExecutable` and `Link`. + pub mode: gix_object::tree::EntryKind, + /// The location of the resource, relative to the working tree. + pub rela_path: &'a BStr, + /// The id of the content as it would be stored in `git`, or `null` if the content doesn't exist anymore at + /// `rela_path` or if it was never computed. This can happen with content read from the worktree, which has to + /// go through a filter to be converted back to what `git` would store. + pub id: &'a gix_hash::oid, +} + +/// +pub mod resource { + use crate::blob::pipeline; + use crate::blob::platform::{CacheKey, CacheValue, Resource}; + + impl<'a> Resource<'a> { + pub(crate) fn new(key: &'a CacheKey, value: &'a CacheValue) -> Self { + Resource { + driver_index: value.conversion.driver_index, + data: value.conversion.data.map_or(Data::Missing, |data| match data { + pipeline::Data::Buffer => Data::Buffer(&value.buffer), + pipeline::Data::Binary { size } => Data::Binary { size }, + }), + mode: value.mode, + rela_path: key.location.as_ref(), + id: &key.id, + } + } + + /// Produce an iterator over lines, separated by LF or CRLF, suitable to create tokens using + /// [`imara_diff::intern::InternedInput`]. + pub fn intern_source(&self) -> imara_diff::sources::ByteLines<'a, true> { + crate::blob::sources::byte_lines_with_terminator(self.data.as_slice().unwrap_or_default()) + } + } + + /// The data of a diffable resource, as it could be determined and computed previously. + #[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] + pub enum Data<'a> { + /// The object is missing, either because it didn't exist in the working tree or because its `id` was null. + Missing, + /// The textual data as processed to be in a diffable state. + Buffer(&'a [u8]), + /// The size that the binary blob had at the given revision, without having applied filters, as it's either + /// considered binary or above the big-file threshold. + /// + /// In this state, the binary file cannot be diffed. + Binary { + /// The size of the object prior to performing any filtering or as it was found on disk. + /// + /// Note that technically, the size isn't always representative of the same 'state' of the + /// content, as once it can be the size of the blob in git, and once it's the size of file + /// in the worktree. + size: u64, + }, + } + + impl<'a> Data<'a> { + /// Return ourselves as slice of bytes if this instance stores data. + pub fn as_slice(&self) -> Option<&'a [u8]> { + match self { + Data::Buffer(d) => Some(d), + Data::Binary { .. } | Data::Missing => None, + } + } + } +} + +/// +pub mod set_resource { + use crate::blob::{pipeline, ResourceKind}; + use bstr::BString; + + /// The error returned by [Platform::set_resource](super::Platform::set_resource). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error("Can only diff blobs and links, not {mode:?}")] + InvalidMode { mode: gix_object::tree::EntryKind }, + #[error("Failed to read {kind} worktree data from '{rela_path}'")] + Io { + rela_path: BString, + kind: ResourceKind, + source: std::io::Error, + }, + #[error("Failed to obtain attributes for {kind} resource at '{rela_path}'")] + Attributes { + rela_path: BString, + kind: ResourceKind, + source: std::io::Error, + }, + #[error(transparent)] + ConvertToDiffable(#[from] pipeline::convert_to_diffable::Error), + } +} + +/// +pub mod prepare_diff { + use crate::blob::platform::Resource; + use bstr::BStr; + + /// The kind of operation that was performed during the [`diff`](super::Platform::prepare_diff()) operation. + #[derive(Debug, Copy, Clone, Eq, PartialEq)] + pub enum Operation<'a> { + /// The [internal diff algorithm](imara_diff::diff) should be called with the provided arguments. + /// This only happens if none of the resources are binary, and if there is no external diff program configured via git-attributes + /// *or* [Options::skip_internal_diff_if_external_is_configured](super::Options::skip_internal_diff_if_external_is_configured) + /// is `false`. + /// + /// Use [`Outcome::interned_input()`] to easily obtain an interner for use with [`imara_diff::diff()`], or maintain one yourself + /// for greater re-use. + InternalDiff { + /// The algorithm we determined should be used, which is one of (in order, first set one wins): + /// + /// * the driver's override + /// * the platforms own configuration (typically from git-config) + /// * the default algorithm + algorithm: imara_diff::Algorithm, + }, + /// Run the external diff program according as configured in the `source`-resources driver. + /// This only happens if [Options::skip_internal_diff_if_external_is_configured](super::Options::skip_internal_diff_if_external_is_configured) + /// was `true`, preventing the usage of the internal diff implementation. + ExternalCommand { + /// The command as extracted from [Driver::command](super::super::Driver::command). + /// Use it in [`Platform::prepare_diff_command`](super::Platform::prepare_diff_command()) to easily prepare a compatible invocation. + command: &'a BStr, + }, + /// One of the involved resources, [`old`](Outcome::old) or [`new`](Outcome::new), were binary and thus no diff + /// cannot be performed. + SourceOrDestinationIsBinary, + } + + /// The outcome of a [`prepare_diff`](super::Platform::prepare_diff()) operation. + #[derive(Debug, Copy, Clone, Eq, PartialEq)] + pub struct Outcome<'a> { + /// The kind of diff that was actually performed. This may include skipping the internal diff as well. + pub operation: Operation<'a>, + /// The old or source of the diff operation. + pub old: Resource<'a>, + /// The new or destination of the diff operation. + pub new: Resource<'a>, + } + + impl<'a> Outcome<'a> { + /// Produce an instance of an interner which `git` would use to perform diffs. + pub fn interned_input(&self) -> imara_diff::intern::InternedInput<&'a [u8]> { + crate::blob::intern::InternedInput::new(self.old.intern_source(), self.new.intern_source()) + } + } + + /// The error returned by [Platform::prepare_diff()](super::Platform::prepare_diff()). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error("Either the source or the destination of the diff operation were not set")] + SourceOrDestinationUnset, + #[error("Tried to diff resources that are both considered removed")] + SourceAndDestinationRemoved, + } +} + +/// +pub mod prepare_diff_command { + use bstr::BString; + use std::ops::{Deref, DerefMut}; + + /// The error returned by [Platform::prepare_diff_command()](super::Platform::prepare_diff_command()). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error("Either the source or the destination of the diff operation were not set")] + SourceOrDestinationUnset, + #[error("Binary resources can't be diffed with an external command (as we don't have the data anymore)")] + SourceOrDestinationBinary, + #[error( + "Tempfile to store content of '{rela_path}' for passing to external diff command could not be created" + )] + CreateTempfile { rela_path: BString, source: std::io::Error }, + #[error("Could not write content of '{rela_path}' to tempfile for passing to external diff command")] + WriteTempfile { rela_path: BString, source: std::io::Error }, + } + + /// The outcome of a [`prepare_diff_command`](super::Platform::prepare_diff_command()) operation. + /// + /// This type acts like [`std::process::Command`], ready to run, with `stdin`, `stdout` and `stderr` set to *inherit* + /// all handles as this is expected to be for visual inspection. + pub struct Command { + pub(crate) cmd: std::process::Command, + /// Possibly a tempfile to be removed after the run, or `None` if there is no old version. + pub(crate) old: Option>, + /// Possibly a tempfile to be removed after the run, or `None` if there is no new version. + pub(crate) new: Option>, + } + + impl Deref for Command { + type Target = std::process::Command; + + fn deref(&self) -> &Self::Target { + &self.cmd + } + } + + impl DerefMut for Command { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.cmd + } + } +} + +/// Options for use in [Platform::new()]. +#[derive(Default, Copy, Clone)] +pub struct Options { + /// The algorithm to use when diffing. + /// If unset, it uses the [default algorithm](Algorithm::default()). + pub algorithm: Option, + /// If `true`, default `false`, then an external `diff` configured using gitattributes and drivers, + /// will cause the built-in diff [to be skipped](prepare_diff::Operation::ExternalCommand). + /// Otherwise, the internal diff is called despite the configured external diff, which is + /// typically what callers expect by default. + pub skip_internal_diff_if_external_is_configured: bool, +} + +/// Lifecycle +impl Platform { + /// Create a new instance with `options`, and a way to `filter` data from the object database to data that is diff-able. + /// `filter_mode` decides how to do that specifically. + /// Use `attr_stack` to access attributes pertaining worktree filters and diff settings. + pub fn new( + options: Options, + filter: Pipeline, + filter_mode: pipeline::Mode, + attr_stack: gix_worktree::Stack, + ) -> Self { + Platform { + old: None, + new: None, + diff_cache: Default::default(), + options, + filter, + filter_mode, + attr_stack, + } + } +} + +/// Conversions +impl Platform { + /// Store enough information about a resource to eventually diff it, where… + /// + /// * `id` is the hash of the resource. If it [is null](gix_hash::ObjectId::is_null()), it should either + /// be a resource in the worktree, or it's considered a non-existing, deleted object. + /// If an `id` is known, as the hash of the object as (would) be stored in `git`, then it should be provided + /// for completeness. + /// * `mode` is the kind of object (only blobs and links are allowed) + /// * `rela_path` is the relative path as seen from the (work)tree root. + /// * `kind` identifies the side of the diff this resource will be used for. + /// A diff needs both `OldOrSource` *and* `NewOrDestination`. + /// * `objects` provides access to the object database in case the resource can't be read from a worktree. + /// + /// Note that it's assumed that either `id + mode (` or `rela_path` can serve as unique identifier for the resource, + /// depending on whether or not a [worktree root](pipeline::WorktreeRoots) is set for the resource of `kind`, + /// with resources with worktree roots using the `rela_path` as unique identifier. + /// + /// ### Important + /// + /// If an error occours, the previous resource of `kind` will be cleared, preventing further diffs + /// unless another attempt succeeds. + pub fn set_resource( + &mut self, + id: gix_hash::ObjectId, + mode: gix_object::tree::EntryKind, + rela_path: &BStr, + kind: ResourceKind, + objects: &impl gix_object::FindObjectOrHeader, // TODO: make this `dyn` once https://github.com/rust-lang/rust/issues/65991 is stable, then also make tracker.rs `objects` dyn + ) -> Result<(), set_resource::Error> { + let res = self.set_resource_inner(id, mode, rela_path, kind, objects); + if res.is_err() { + *match kind { + ResourceKind::OldOrSource => &mut self.old, + ResourceKind::NewOrDestination => &mut self.new, + } = None; + } + res + } + + /// Given `diff_command` and `context`, typically obtained from git-configuration, and the currently set diff-resources, + /// prepare the invocation and temporary files needed to launch it according to protocol. + /// `count` / `total` are used for progress indication passed as environment variables `GIT_DIFF_PATH_(COUNTER|TOTAL)` + /// respectively (0-based), so the first path has `count=0` and `total=1` (assuming there is only one path). + /// Returns `None` if at least one resource is unset, see [`set_resource()`](Self::set_resource()). + /// + /// Please note that this is an expensive operation this will always create up to two temporary files to hold the data + /// for the old and new resources. + /// + /// ### Deviation + /// + /// If one of the resources is binary, the operation reports an error as such resources don't make their data available + /// which is required for the external diff to run. + pub fn prepare_diff_command( + &self, + diff_command: BString, + context: gix_command::Context, + count: usize, + total: usize, + ) -> Result { + fn add_resource( + cmd: &mut std::process::Command, + res: Resource<'_>, + ) -> Result>, prepare_diff_command::Error> { + let tmpfile = match res.data { + resource::Data::Missing => { + cmd.args(["/dev/null", ".", "."]); + None + } + resource::Data::Buffer(buf) => { + let mut tmp = gix_tempfile::new( + std::env::temp_dir(), + gix_tempfile::ContainingDirectory::Exists, + gix_tempfile::AutoRemove::Tempfile, + ) + .map_err(|err| prepare_diff_command::Error::CreateTempfile { + rela_path: res.rela_path.to_owned(), + source: err, + })?; + tmp.write_all(buf) + .map_err(|err| prepare_diff_command::Error::WriteTempfile { + rela_path: res.rela_path.to_owned(), + source: err, + })?; + tmp.with_mut(|f| { + cmd.arg(f.path()); + }) + .map_err(|err| prepare_diff_command::Error::WriteTempfile { + rela_path: res.rela_path.to_owned(), + source: err, + })?; + cmd.arg(res.id.to_string()).arg(res.mode.as_octal_str().to_string()); + let tmp = tmp.close().map_err(|err| prepare_diff_command::Error::WriteTempfile { + rela_path: res.rela_path.to_owned(), + source: err, + })?; + Some(tmp) + } + resource::Data::Binary { .. } => return Err(prepare_diff_command::Error::SourceOrDestinationBinary), + }; + Ok(tmpfile) + } + + let (old, new) = self + .resources() + .ok_or(prepare_diff_command::Error::SourceOrDestinationUnset)?; + let mut cmd: std::process::Command = gix_command::prepare(gix_path::from_bstring(diff_command)) + .with_context(context) + .env("GIT_DIFF_PATH_COUNTER", (count + 1).to_string()) + .env("GIT_DIFF_PATH_TOTAL", total.to_string()) + .stdin(Stdio::inherit()) + .stdout(Stdio::inherit()) + .stderr(Stdio::inherit()) + .into(); + + cmd.arg(gix_path::from_bstr(old.rela_path).into_owned()); + let mut out = prepare_diff_command::Command { + cmd, + old: None, + new: None, + }; + + out.old = add_resource(&mut out.cmd, old)?; + out.new = add_resource(&mut out.cmd, new)?; + + if old.rela_path != new.rela_path { + out.cmd.arg(gix_path::from_bstr(new.rela_path).into_owned()); + } + + Ok(out) + } + + /// Returns the resource of the given kind if it was set. + pub fn resource(&self, kind: ResourceKind) -> Option> { + let key = match kind { + ResourceKind::OldOrSource => self.old.as_ref(), + ResourceKind::NewOrDestination => self.new.as_ref(), + }?; + Resource::new(key, self.diff_cache.get(key)?).into() + } + + /// Obtain the two resources that were previously set as `(OldOrSource, NewOrDestination)`, if both are set and available. + /// + /// This is useful if one wishes to manually prepare the diff, maybe for invoking external programs, instead of relying on + /// [`Self::prepare_diff()`]. + pub fn resources(&self) -> Option<(Resource<'_>, Resource<'_>)> { + let key = &self.old.as_ref()?; + let value = self.diff_cache.get(key)?; + let old = Resource::new(key, value); + + let key = &self.new.as_ref()?; + let value = self.diff_cache.get(key)?; + let new = Resource::new(key, value); + Some((old, new)) + } + + /// Prepare a diff operation on the [previously set](Self::set_resource()) [old](ResourceKind::OldOrSource) and + /// [new](ResourceKind::NewOrDestination) resources. + /// + /// The returned outcome allows to easily perform diff operations, based on the [`prepare_diff::Outcome::operation`] field, + /// which hints at what should be done. + pub fn prepare_diff(&mut self) -> Result, prepare_diff::Error> { + let old_key = &self.old.as_ref().ok_or(prepare_diff::Error::SourceOrDestinationUnset)?; + let old = self + .diff_cache + .get(old_key) + .ok_or(prepare_diff::Error::SourceOrDestinationUnset)?; + let new_key = &self.new.as_ref().ok_or(prepare_diff::Error::SourceOrDestinationUnset)?; + let new = self + .diff_cache + .get(new_key) + .ok_or(prepare_diff::Error::SourceOrDestinationUnset)?; + let mut out = prepare_diff::Outcome { + operation: prepare_diff::Operation::SourceOrDestinationIsBinary, + old: Resource::new(old_key, old), + new: Resource::new(new_key, new), + }; + + match (old.conversion.data, new.conversion.data) { + (None, None) => return Err(prepare_diff::Error::SourceAndDestinationRemoved), + (Some(pipeline::Data::Binary { .. }), _) | (_, Some(pipeline::Data::Binary { .. })) => return Ok(out), + _either_missing_or_non_binary => { + if let Some(command) = old + .conversion + .driver_index + .and_then(|idx| self.filter.drivers[idx].command.as_deref()) + .filter(|_| self.options.skip_internal_diff_if_external_is_configured) + { + out.operation = prepare_diff::Operation::ExternalCommand { + command: command.as_bstr(), + }; + return Ok(out); + } + } + } + + out.operation = prepare_diff::Operation::InternalDiff { + algorithm: old + .conversion + .driver_index + .and_then(|idx| self.filter.drivers[idx].algorithm) + .or(self.options.algorithm) + .unwrap_or_default(), + }; + Ok(out) + } + + /// Every call to [set_resource()](Self::set_resource()) will keep the diffable data in memory, and that will never be cleared. + /// + /// Use this method to clear the cache, releasing memory. Note that this will also loose all information about resources + /// which means diffs would fail unless the resources are set again. + /// + /// Note that this also has to be called if the same resource is going to be diffed in different states, i.e. using different + /// `id`s, but the same `rela_path`. + pub fn clear_resource_cache(&mut self) { + self.old = None; + self.new = None; + self.diff_cache.clear(); + } +} + +impl Platform { + fn set_resource_inner( + &mut self, + id: gix_hash::ObjectId, + mode: gix_object::tree::EntryKind, + rela_path: &BStr, + kind: ResourceKind, + objects: &impl gix_object::FindObjectOrHeader, + ) -> Result<(), set_resource::Error> { + if matches!( + mode, + gix_object::tree::EntryKind::Commit | gix_object::tree::EntryKind::Tree + ) { + return Err(set_resource::Error::InvalidMode { mode }); + } + let storage = match kind { + ResourceKind::OldOrSource => &mut self.old, + ResourceKind::NewOrDestination => &mut self.new, + } + .get_or_insert_with(Default::default); + + storage.id = id; + storage.set_location(rela_path); + storage.is_link = matches!(mode, gix_object::tree::EntryKind::Link); + storage.use_id = self.filter.roots.by_kind(kind).is_none(); + + if self.diff_cache.contains_key(storage) { + return Ok(()); + } + let entry = self + .attr_stack + .at_entry(rela_path, Some(false), objects) + .map_err(|err| set_resource::Error::Attributes { + source: err, + kind, + rela_path: rela_path.to_owned(), + })?; + let mut buf = Vec::new(); + let out = self.filter.convert_to_diffable( + &id, + mode, + rela_path, + kind, + &mut |_, out| { + let _ = entry.matching_attributes(out); + }, + objects, + self.filter_mode, + &mut buf, + )?; + let key = storage.clone(); + assert!( + self.diff_cache + .insert( + key, + CacheValue { + conversion: out, + mode, + buffer: buf, + }, + ) + .is_none(), + "The key impl makes clashes impossible with our usage" + ); + Ok(()) + } +} diff --git a/gix-diff/src/lib.rs b/gix-diff/src/lib.rs index 1ad5b1478db..c055e45ad34 100644 --- a/gix-diff/src/lib.rs +++ b/gix-diff/src/lib.rs @@ -8,6 +8,14 @@ #![deny(missing_docs, rust_2018_idioms)] #![forbid(unsafe_code)] +/// Re-export for use in public API. +#[cfg(feature = "blob")] +pub use gix_command as command; + +/// Re-export for use in public API. +#[cfg(feature = "blob")] +pub use gix_object as object; + /// A structure to capture how to perform rename and copy tracking, used by the [rewrites::Tracker]. #[derive(Debug, Copy, Clone, PartialEq)] #[cfg(feature = "blob")] diff --git a/gix-diff/src/rewrites/mod.rs b/gix-diff/src/rewrites/mod.rs index 8af13165f6f..08d6f2cce53 100644 --- a/gix-diff/src/rewrites/mod.rs +++ b/gix-diff/src/rewrites/mod.rs @@ -10,14 +10,8 @@ pub struct Tracker { items: Vec>, /// A place to store all paths in to reduce amount of allocations. path_backing: Vec, - /// A buffer for use when fetching objects for similarity tests. - buf1: Vec, - /// Another buffer for use when fetching objects for similarity tests. - buf2: Vec, /// How to track copies and/or rewrites. rewrites: Rewrites, - /// The diff algorithm to use when checking for similarity. - diff_algo: crate::blob::Algorithm, } /// Determine in which set of files to search for copies. diff --git a/gix-diff/src/rewrites/tracker.rs b/gix-diff/src/rewrites/tracker.rs index 09d3c724608..fca315a253e 100644 --- a/gix-diff/src/rewrites/tracker.rs +++ b/gix-diff/src/rewrites/tracker.rs @@ -1,12 +1,18 @@ +//! ### Deviation +//! +//! Note that the algorithm implemented here is in many ways different from what `git` does. +//! +//! - it's less sophisticated and doesn't use any ranking of candidates. Instead, it picks the first possible match. +//! - the set used for copy-detection is probably smaller by default. use std::ops::Range; use gix_object::tree::{EntryKind, EntryMode}; -use crate::blob::DiffLineStats; +use crate::blob::platform::prepare_diff::Operation; +use crate::blob::{DiffLineStats, ResourceKind}; use crate::rewrites::{CopySource, Outcome}; use crate::{rewrites::Tracker, Rewrites}; use bstr::BStr; -use gix_object::FindExt; /// The kind of a change. #[derive(Debug, Copy, Clone, Ord, PartialOrd, PartialEq, Eq)] @@ -22,6 +28,9 @@ pub enum ChangeKind { /// A trait providing all functionality to abstract over the concept of a change, as seen by the [`Tracker`]. pub trait Change: Clone { /// Return the hash of this change for identification. + /// + /// Note that this is the id of the object as stored in `git`, i.e. it must have gone through workspace + /// conversions. fn id(&self) -> &gix_hash::oid; /// Return the kind of this change. fn kind(&self) -> ChangeKind; @@ -71,6 +80,7 @@ pub mod visit { use gix_object::tree::EntryMode; /// The source of a rewrite, rename or copy. + #[derive(Debug, Clone, PartialEq, PartialOrd)] pub struct Source<'a> { /// The kind of entry. pub entry_mode: EntryMode, @@ -85,7 +95,7 @@ pub mod visit { } /// Further identify the kind of [Source]. - #[derive(Debug, Copy, Clone, Eq, PartialEq)] + #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] pub enum SourceKind { /// This is the source of an entry that was renamed, as `source` was renamed to `destination`. Rename, @@ -94,7 +104,8 @@ pub mod visit { } /// A change along with a location. - pub struct Destination<'a, T> { + #[derive(Clone)] + pub struct Destination<'a, T: Clone> { /// The change at the given `location`. pub change: T, /// The repository-relative location of this destination. @@ -112,21 +123,21 @@ pub mod emit { FindExistingBlob(#[from] gix_object::find::existing_object::Error), #[error("Could not obtain exhaustive item set to use as possible sources for copy detection")] GetItemsForExhaustiveCopyDetection(#[source] Box), + #[error(transparent)] + SetResource(#[from] crate::blob::platform::set_resource::Error), + #[error(transparent)] + PrepareDiff(#[from] crate::blob::platform::prepare_diff::Error), } } /// Lifecycle impl Tracker { - /// Create a new instance with `rewrites` configuration, and the `diff_algo` to use when performing - /// similarity checking. - pub fn new(rewrites: Rewrites, diff_algo: crate::blob::Algorithm) -> Self { + /// Create a new instance with `rewrites` configuration. + pub fn new(rewrites: Rewrites) -> Self { Tracker { items: vec![], path_backing: vec![], - buf1: Vec::new(), - buf2: Vec::new(), rewrites, - diff_algo, } } } @@ -158,14 +169,22 @@ impl Tracker { None } - /// Can only be called once effectively as it alters its own state. + /// Can only be called once effectively as it alters its own state to assure each item is only emitted once. /// /// `cb(destination, source)` is called for each item, either with `Some(source)` if it's /// the destination of a copy or rename, or with `None` for source if no relation to other - /// items in the tracked set exist. + /// items in the tracked set exist, which is like saying 'no rename or rewrite or copy' happened. /// /// `objects` is used to access blob data for similarity checks if required and is taken directly from the object database. - /// Worktree filters and diff conversions will be applied afterwards automatically. + /// Worktree filters and text conversions will be applied afterwards automatically. Note that object-caching *should not* + /// be enabled as caching is implemented by `diff_cache`, after all, the blob that's actually diffed is going + /// through conversion steps. + /// + /// `diff_cache` is a way to retain a cache of resources that are prepared for rapid diffing, and it also controls + /// the diff-algorithm (provided no user-algorithm is set). + /// Note that we control a few options of `diff_cache` to assure it will ignore external commands. + /// Note that we do not control how the `diff_cache` converts resources, it's left to the caller to decide + /// if it should look at what's stored in `git`, or in the working tree, along with all diff-specific conversions. /// /// `push_source_tree(push_fn: push(change, location))` is a function that is called when the entire tree of the source /// should be added as modifications by calling `push` repeatedly to use for perfect copy tracking. Note that `push` @@ -173,13 +192,16 @@ impl Tracker { pub fn emit( &mut self, mut cb: impl FnMut(visit::Destination<'_, T>, Option>) -> crate::tree::visit::Action, - objects: &dyn gix_object::Find, + diff_cache: &mut crate::blob::Platform, + objects: &impl gix_object::FindObjectOrHeader, mut push_source_tree: PushSourceTreeFn, ) -> Result where PushSourceTreeFn: FnMut(&mut dyn FnMut(T, &BStr)) -> Result<(), E>, E: std::error::Error + Send + Sync + 'static, { + diff_cache.options.skip_internal_diff_if_external_is_configured = false; + fn by_id_and_location(a: &Item, b: &Item) -> std::cmp::Ordering { a.change .id() @@ -192,16 +214,24 @@ impl Tracker { options: self.rewrites, ..Default::default() }; - out = self.match_pairs_of_kind( + self.match_pairs_of_kind( visit::SourceKind::Rename, &mut cb, self.rewrites.percentage, - out, + &mut out, + diff_cache, objects, )?; if let Some(copies) = self.rewrites.copies { - out = self.match_pairs_of_kind(visit::SourceKind::Copy, &mut cb, copies.percentage, out, objects)?; + self.match_pairs_of_kind( + visit::SourceKind::Copy, + &mut cb, + copies.percentage, + &mut out, + diff_cache, + objects, + )?; match copies.source { CopySource::FromSetOfModifiedFiles => {} @@ -217,8 +247,14 @@ impl Tracker { .map_err(|err| emit::Error::GetItemsForExhaustiveCopyDetection(Box::new(err)))?; self.items.sort_by(by_id_and_location); - out = - self.match_pairs_of_kind(visit::SourceKind::Copy, &mut cb, copies.percentage, out, objects)?; + self.match_pairs_of_kind( + visit::SourceKind::Copy, + &mut cb, + copies.percentage, + &mut out, + diff_cache, + objects, + )?; } } } @@ -247,36 +283,43 @@ impl Tracker { kind: visit::SourceKind, cb: &mut impl FnMut(visit::Destination<'_, T>, Option>) -> crate::tree::visit::Action, percentage: Option, - mut out: Outcome, - objects: &dyn gix_object::Find, - ) -> Result { + out: &mut Outcome, + diff_cache: &mut crate::blob::Platform, + objects: &impl gix_object::FindObjectOrHeader, + ) -> Result<(), emit::Error> { // we try to cheaply reduce the set of possibilities first, before possibly looking more exhaustively. let needs_second_pass = !needs_exact_match(percentage); - if self.match_pairs(cb, None /* by identity */, kind, &mut out, objects)? == crate::tree::visit::Action::Cancel + if self.match_pairs(cb, None /* by identity */, kind, out, diff_cache, objects)? + == crate::tree::visit::Action::Cancel { - return Ok(out); + return Ok(()); } if needs_second_pass { let is_limited = if self.rewrites.limit == 0 { false - } else if let Some(permutations) = permutations_over_limit(&self.items, self.rewrites.limit, kind) { - match kind { - visit::SourceKind::Rename => { - out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit = permutations; - } - visit::SourceKind::Copy => { - out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit = permutations; + } else { + let (num_src, num_dst) = + estimate_involved_items(self.items.iter().map(|item| (item.emitted, item.change.kind())), kind); + let permutations = num_src * num_dst; + if permutations > self.rewrites.limit { + match kind { + visit::SourceKind::Rename => { + out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit = permutations; + } + visit::SourceKind::Copy => { + out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit = permutations; + } } + true + } else { + false } - true - } else { - false }; if !is_limited { - self.match_pairs(cb, percentage, kind, &mut out, objects)?; + self.match_pairs(cb, percentage, kind, out, diff_cache, objects)?; } } - Ok(out) + Ok(()) } fn match_pairs( @@ -285,9 +328,9 @@ impl Tracker { percentage: Option, kind: visit::SourceKind, stats: &mut Outcome, - objects: &dyn gix_object::Find, + diff_cache: &mut crate::blob::Platform, + objects: &impl gix_object::FindObjectOrHeader, ) -> Result { - // TODO(perf): reuse object data and interner state and interned tokens, make these available to `find_match()` let mut dest_ofs = 0; while let Some((mut dest_idx, dest)) = self.items[dest_ofs..].iter().enumerate().find_map(|(idx, item)| { (!item.emitted && matches!(item.change.kind(), ChangeKind::Addition)).then_some((idx, item)) @@ -298,12 +341,12 @@ impl Tracker { &self.items, dest, dest_idx, - percentage.map(|p| (p, self.diff_algo)), + percentage, kind, stats, objects, - &mut self.buf1, - &mut self.buf2, + diff_cache, + &self.path_backing, )? .map(|(src_idx, src, diff)| { let (id, entry_mode) = src.change.id_and_entry_mode(); @@ -338,17 +381,23 @@ impl Tracker { } } -fn permutations_over_limit(items: &[Item], limit: usize, kind: visit::SourceKind) -> Option { - let (sources, destinations) = items - .iter() - .filter(|item| match kind { - visit::SourceKind::Rename => !item.emitted, +/// Returns the amount of viable sources and destinations for `items` as eligible for the given `kind` of operation. +fn estimate_involved_items( + items: impl IntoIterator, + kind: visit::SourceKind, +) -> (usize, usize) { + items + .into_iter() + .filter(|(emitted, _)| match kind { + visit::SourceKind::Rename => !*emitted, visit::SourceKind::Copy => true, }) - .fold((0, 0), |(mut src, mut dest), item| { - match item.change.kind() { + .fold((0, 0), |(mut src, mut dest), (emitted, change_kind)| { + match change_kind { ChangeKind::Addition => { - dest += 1; + if kind == visit::SourceKind::Rename || !emitted { + dest += 1; + } } ChangeKind::Deletion => { if kind == visit::SourceKind::Rename { @@ -362,9 +411,7 @@ fn permutations_over_limit(items: &[Item], limit: usize, kind: vis } } (src, dest) - }); - let permutations = sources * destinations; - (permutations > limit * limit).then_some(permutations) + }) } fn needs_exact_match(percentage: Option) -> bool { @@ -386,15 +433,15 @@ fn find_match<'a, T: Change>( items: &'a [Item], item: &Item, item_idx: usize, - percentage: Option<(f32, crate::blob::Algorithm)>, + percentage: Option, kind: visit::SourceKind, stats: &mut Outcome, - objects: &dyn gix_object::Find, - buf1: &mut Vec, - buf2: &mut Vec, + objects: &impl gix_object::FindObjectOrHeader, + diff_cache: &mut crate::blob::Platform, + path_backing: &[u8], ) -> Result>, emit::Error> { let (item_id, item_mode) = item.change.id_and_entry_mode(); - if needs_exact_match(percentage.map(|t| t.0)) || item_mode.is_link() { + if needs_exact_match(percentage) || item_mode.is_link() { let first_idx = items.partition_point(|a| a.change.id() < item_id); let range = match items.get(first_idx..).map(|items| { let end = items @@ -417,47 +464,76 @@ fn find_match<'a, T: Change>( return Ok(Some(src)); } } else { - let new = objects.find_blob(item_id, buf1)?; - let (percentage, algo) = percentage.expect("it's set to something below 1.0 and we assured this"); + let mut has_new = false; + let percentage = percentage.expect("it's set to something below 1.0 and we assured this"); debug_assert_eq!( item.change.entry_mode().kind(), EntryKind::Blob, "symlinks are matched exactly, and trees aren't used here" ); + for (can_idx, src) in items .iter() .enumerate() .filter(|(src_idx, src)| *src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)) { - let old = objects.find_blob(src.change.id(), buf2)?; - // TODO: make sure we get attribute handling/worktree conversion and binary skips and filters right here. - let tokens = crate::blob::intern::InternedInput::new( - crate::blob::sources::byte_lines_with_terminator(old.data), - crate::blob::sources::byte_lines_with_terminator(new.data), - ); - let counts = crate::blob::diff( - algo, - &tokens, - crate::blob::sink::Counter::new(diff::Statistics { - removed_bytes: 0, - input: &tokens, - }), - ); - let similarity = (old.data.len() - counts.wrapped) as f32 / old.data.len().max(new.data.len()) as f32; + if !has_new { + diff_cache.set_resource( + item_id.to_owned(), + item_mode.kind(), + item.location(path_backing), + ResourceKind::NewOrDestination, + objects, + )?; + has_new = true; + } + let (src_id, src_mode) = src.change.id_and_entry_mode(); + diff_cache.set_resource( + src_id.to_owned(), + src_mode.kind(), + src.location(path_backing), + ResourceKind::OldOrSource, + objects, + )?; + let prep = diff_cache.prepare_diff()?; stats.num_similarity_checks += 1; - if similarity >= percentage { - return Ok(Some(( - can_idx, - src, - DiffLineStats { - removals: counts.removals, - insertions: counts.insertions, - before: tokens.before.len().try_into().expect("interner handles only u32"), - after: tokens.after.len().try_into().expect("interner handles only u32"), + match prep.operation { + Operation::InternalDiff { algorithm } => { + let tokens = + crate::blob::intern::InternedInput::new(prep.old.intern_source(), prep.new.intern_source()); + let counts = crate::blob::diff( + algorithm, + &tokens, + crate::blob::sink::Counter::new(diff::Statistics { + removed_bytes: 0, + input: &tokens, + }), + ); + let old_data_len = prep.old.data.as_slice().unwrap_or_default().len(); + let new_data_len = prep.new.data.as_slice().unwrap_or_default().len(); + let similarity = (old_data_len - counts.wrapped) as f32 / old_data_len.max(new_data_len) as f32; + if similarity >= percentage { + return Ok(Some(( + can_idx, + src, + DiffLineStats { + removals: counts.removals, + insertions: counts.insertions, + before: tokens.before.len().try_into().expect("interner handles only u32"), + after: tokens.after.len().try_into().expect("interner handles only u32"), + similarity, + } + .into(), + ))); } - .into(), - ))); - } + } + Operation::ExternalCommand { .. } => { + unreachable!("we have disabled this possibility with an option") + } + Operation::SourceOrDestinationIsBinary => { + // TODO: figure out if git does more here + } + }; } } Ok(None) @@ -486,3 +562,58 @@ mod diff { } } } + +#[cfg(test)] +mod estimate_involved_items { + use super::estimate_involved_items; + use crate::rewrites::tracker::visit::SourceKind; + use crate::rewrites::tracker::ChangeKind; + + #[test] + fn renames_count_unemitted_as_sources_and_destinations() { + let items = [ + (false, ChangeKind::Addition), + (true, ChangeKind::Deletion), + (true, ChangeKind::Deletion), + ]; + assert_eq!( + estimate_involved_items(items, SourceKind::Rename), + (0, 1), + "here we only have one eligible source, hence nothing to do" + ); + assert_eq!( + estimate_involved_items(items.into_iter().map(|t| (false, t.1)), SourceKind::Rename), + (2, 1), + "now we have more possibilities as renames count un-emitted deletions as source" + ); + } + + #[test] + fn copies_do_not_count_additions_as_sources() { + let items = [ + (false, ChangeKind::Addition), + (true, ChangeKind::Addition), + (true, ChangeKind::Deletion), + ]; + assert_eq!( + estimate_involved_items(items, SourceKind::Copy), + (0, 1), + "one addition as source, the other isn't counted as it's emitted, nor is it considered a copy-source.\ + deletions don't count" + ); + } + + #[test] + fn copies_count_modifications_as_sources() { + let items = [ + (false, ChangeKind::Addition), + (true, ChangeKind::Modification), + (false, ChangeKind::Modification), + ]; + assert_eq!( + estimate_involved_items(items, SourceKind::Copy), + (2, 1), + "any modifications is a valid source, emitted or not" + ); + } +} diff --git a/gix-diff/tests/Cargo.toml b/gix-diff/tests/Cargo.toml index 6d07360ae38..e63c3d5a320 100644 --- a/gix-diff/tests/Cargo.toml +++ b/gix-diff/tests/Cargo.toml @@ -17,7 +17,11 @@ path = "diff.rs" [dev-dependencies] gix-diff = { path = ".." } gix-hash = { path = "../../gix-hash" } +gix-fs = { path = "../../gix-fs" } +gix-worktree = { path = "../../gix-worktree" } gix-object = { path = "../../gix-object" } gix-odb = { path = "../../gix-odb" } +gix-filter = { path = "../../gix-filter" } gix-traverse = { path = "../../gix-traverse" } gix-testtools = { path = "../../tests/tools" } +shell-words = "1" diff --git a/gix-diff/tests/blob/mod.rs b/gix-diff/tests/blob/mod.rs index 33b38e69ef5..bb412d412cf 100644 --- a/gix-diff/tests/blob/mod.rs +++ b/gix-diff/tests/blob/mod.rs @@ -1,2 +1,2 @@ -#[test] -fn currently_there_is_no_api_surface_to_test_as_it_is_reexporting_imara_diff() {} +pub(crate) mod pipeline; +mod platform; diff --git a/gix-diff/tests/blob/pipeline.rs b/gix-diff/tests/blob/pipeline.rs new file mode 100644 index 00000000000..e7a8385c7a7 --- /dev/null +++ b/gix-diff/tests/blob/pipeline.rs @@ -0,0 +1,925 @@ +pub(crate) mod convert_to_diffable { + + use crate::util::ObjectDb; + use gix_diff::blob::pipeline::Options; + use gix_diff::blob::{pipeline, pipeline::WorktreeRoots, ResourceKind}; + use gix_filter::eol; + use gix_filter::eol::AutoCrlf; + use gix_object::bstr::ByteSlice; + use gix_object::tree::EntryKind; + use gix_worktree::stack::state::attributes; + + #[test] + fn simple() -> crate::Result { + for mode in [ + pipeline::Mode::ToWorktreeAndBinaryToText, + pipeline::Mode::ToGit, + pipeline::Mode::ToGitUnlessBinaryToTextIsPresent, + ] { + let tmp = gix_testtools::tempfile::TempDir::new()?; + let mut filter = gix_diff::blob::Pipeline::new( + WorktreeRoots { + old_root: Some(tmp.path().to_owned()), + new_root: None, + }, + gix_filter::Pipeline::default(), + vec![], + default_options(), + ); + + let does_not_matter = gix_hash::Kind::Sha1.null(); + let mut buf = Vec::new(); + let a_name = "a"; + let a_content = "a-content"; + std::fs::write(tmp.path().join(a_name), a_content.as_bytes())?; + let out = filter.convert_to_diffable( + &does_not_matter, + EntryKind::Blob, + a_name.into(), + ResourceKind::OldOrSource, + &mut |_, _| {}, + &gix_object::find::Never, + mode, + &mut buf, + )?; + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!(buf.as_bstr(), a_content, "there is no transformations configured"); + + let link_name = "link"; + gix_fs::symlink::create(a_name.as_ref(), &tmp.path().join(link_name))?; + let out = filter.convert_to_diffable( + &does_not_matter, + EntryKind::Link, + link_name.into(), + ResourceKind::OldOrSource, + &mut |_, _| {}, + &gix_object::find::Never, + mode, + &mut buf, + )?; + + assert!(out.driver_index.is_none()); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + a_name, + "links are just files with a different mode, with its content pointing to the target" + ); + drop(tmp); + + let mut db = ObjectDb::default(); + let b_content = "b-content"; + let id = db.insert(b_content); + + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + a_name.into(), + ResourceKind::NewOrDestination, + &mut |_, _| {}, + &db, + mode, + &mut buf, + )?; + + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!(buf.as_bstr(), b_content, "there is no transformations configured"); + } + + Ok(()) + } + + #[test] + fn above_large_file_threshold() -> crate::Result { + let tmp = gix_testtools::tempfile::TempDir::new()?; + let mut filter = gix_diff::blob::Pipeline::new( + WorktreeRoots { + old_root: None, + new_root: Some(tmp.path().to_owned()), + }, + gix_filter::Pipeline::default(), + vec![], + gix_diff::blob::pipeline::Options { + large_file_threshold_bytes: 4, + ..default_options() + }, + ); + + let does_not_matter = gix_hash::Kind::Sha1.null(); + let mut buf = Vec::new(); + let a_name = "a"; + let large_content = "hello"; + std::fs::write(tmp.path().join(a_name), large_content.as_bytes())?; + let out = filter.convert_to_diffable( + &does_not_matter, + EntryKind::BlobExecutable, + a_name.into(), + ResourceKind::NewOrDestination, + &mut |_, _| {}, + &gix_object::find::Never, + pipeline::Mode::default(), + &mut buf, + )?; + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, Some(pipeline::Data::Binary { size: 5 })); + assert_eq!(buf.len(), 0, "it should avoid querying that data in the first place"); + + // On windows, this test fails as it needs the link target to exist, and it was + // hard to make it exist with a relative path, strangely enough. + // For worktree checkouts, this works, that's all that matters for now. + if !cfg!(windows) { + let link_name = "link"; + gix_fs::symlink::create(large_content.as_ref(), &tmp.path().join(link_name))?; + let out = filter.convert_to_diffable( + &does_not_matter, + EntryKind::Link, + link_name.into(), + ResourceKind::NewOrDestination, + &mut |_, _| {}, + &gix_object::find::Never, + pipeline::Mode::default(), + &mut buf, + )?; + + assert!(out.driver_index.is_none()); + assert_eq!( + out.data, + Some(pipeline::Data::Buffer), + "links are always read and never considered large" + ); + assert_eq!(buf.as_bstr(), large_content); + } + drop(tmp); + + let mut db = ObjectDb::default(); + let id = db.insert(large_content); + + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + a_name.into(), + ResourceKind::OldOrSource, + &mut |_, _| {}, + &db, + pipeline::Mode::default(), + &mut buf, + )?; + + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, Some(pipeline::Data::Binary { size: 5 })); + assert_eq!(buf.len(), 0, "it should avoid querying that data in the first place"); + + Ok(()) + } + + #[test] + fn non_existing() -> crate::Result { + let tmp = gix_testtools::tempfile::TempDir::new()?; + let mut filter = gix_diff::blob::Pipeline::new( + WorktreeRoots { + old_root: Some(tmp.path().to_owned()), + new_root: None, + }, + gix_filter::Pipeline::default(), + vec![], + default_options(), + ); + + let null = gix_hash::Kind::Sha1.null(); + let mut buf = vec![1]; + let a_name = "a"; + assert!( + !tmp.path().join(a_name).exists(), + "precondition: worktree file doesn't exist" + ); + let out = filter.convert_to_diffable( + &null, + EntryKind::Blob, + a_name.into(), + ResourceKind::OldOrSource, + &mut |_, _| {}, + &gix_object::find::Never, + pipeline::Mode::default(), + &mut buf, + )?; + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, None); + assert_eq!(buf.len(), 0, "always cleared"); + + buf.push(1); + let out = filter.convert_to_diffable( + &null, + EntryKind::Link, + "link".into(), + ResourceKind::OldOrSource, + &mut |_, _| {}, + &gix_object::find::Never, + pipeline::Mode::default(), + &mut buf, + )?; + assert!(out.driver_index.is_none()); + assert_eq!(out.data, None); + assert_eq!(buf.len(), 0, "always cleared"); + + drop(tmp); + + buf.push(1); + let out = filter.convert_to_diffable( + &null, + EntryKind::Blob, + a_name.into(), + ResourceKind::NewOrDestination, + &mut |_, _| {}, + &gix_object::find::Never, + pipeline::Mode::default(), + &mut buf, + )?; + + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, None); + assert_eq!(buf.len(), 0, "it's always cleared before any potential use"); + + Ok(()) + } + + #[test] + fn worktree_filter() -> crate::Result { + let tmp = gix_testtools::tempfile::TempDir::new()?; + let filter = gix_filter::Pipeline::new( + Default::default(), + gix_filter::pipeline::Options { + eol_config: eol::Configuration { + auto_crlf: AutoCrlf::Enabled, + ..Default::default() + }, + ..Default::default() + }, + ); + let mut filter = gix_diff::blob::Pipeline::new( + WorktreeRoots { + old_root: Some(tmp.path().to_owned()), + new_root: None, + }, + filter, + vec![], + default_options(), + ); + + let does_not_matter = gix_hash::Kind::Sha1.null(); + let mut buf = Vec::new(); + let a_name = "a"; + let a_content = "a-content\n"; + std::fs::write(tmp.path().join(a_name), a_content.as_bytes())?; + let out = filter.convert_to_diffable( + &does_not_matter, + EntryKind::Blob, + a_name.into(), + ResourceKind::OldOrSource, + &mut |_, _| {}, + &gix_object::find::Never, + pipeline::Mode::default(), + &mut buf, + )?; + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + a_content, + "worktree files are assumed to be filtered already, and are verbatim" + ); + + let b_name = "b"; + let b_content = "a\r\nb"; + std::fs::write(tmp.path().join(b_name), b_content.as_bytes())?; + let out = filter.convert_to_diffable( + &does_not_matter, + EntryKind::Blob, + b_name.into(), + ResourceKind::OldOrSource, + &mut |_, _| {}, + &gix_object::find::Never, + pipeline::Mode::ToGit, + &mut buf, + )?; + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + "a\nb", + "worktree files are converted back to git if the mode needs it" + ); + + // On windows, this test fails as it needs the link target to exist, and it's kind of impossible + // to test what we want to test apparently. + if !cfg!(windows) { + let link_name = "link"; + let link_content = "hello\n"; + gix_fs::symlink::create(link_content.as_ref(), &tmp.path().join(link_name))?; + let out = filter.convert_to_diffable( + &does_not_matter, + EntryKind::Link, + link_name.into(), + ResourceKind::OldOrSource, + &mut |_, _| {}, + &gix_object::find::Never, + pipeline::Mode::default(), + &mut buf, + )?; + + assert!(out.driver_index.is_none()); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + link_content, + "links aren't put through worktree filters, otherwise it would have its newlines replaced" + ); + } + drop(tmp); + + let mut db = ObjectDb::default(); + let b_content = "b-content\n"; + let id = db.insert(b_content); + + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + a_name.into(), + ResourceKind::NewOrDestination, + &mut |_, _| {}, + &db, + pipeline::Mode::default(), + &mut buf, + )?; + + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!(buf.as_bstr(), "b-content\r\n", "LF to CRLF by worktree filtering"); + + let mut db = ObjectDb::default(); + let b_content = "b\n"; + let id = db.insert(b_content); + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + a_name.into(), + ResourceKind::NewOrDestination, + &mut |_, _| {}, + &db, + pipeline::Mode::ToGit, + &mut buf, + )?; + + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!(buf.as_bstr(), "b\n", "no filtering was performed at all"); + + Ok(()) + } + + #[test] + fn binary_by_buffer_inspection() -> crate::Result { + let tmp = gix_testtools::tempfile::TempDir::new()?; + let root = gix_testtools::scripted_fixture_read_only_standalone("make_blob_repo.sh")?; + let mut attributes = gix_worktree::Stack::new( + root, + gix_worktree::stack::State::AttributesStack(gix_worktree::stack::state::Attributes::new( + Default::default(), + None, + attributes::Source::WorktreeThenIdMapping, + Default::default(), + )), + gix_worktree::glob::pattern::Case::Sensitive, + Vec::new(), + Vec::new(), + ); + let mut filter = gix_diff::blob::Pipeline::new( + WorktreeRoots { + old_root: Some(tmp.path().to_owned()), + new_root: None, + }, + gix_filter::Pipeline::default(), + vec![gix_diff::blob::Driver { + name: "c".into(), + binary_to_text_command: Some("printf '\\0'; cat <".into()), + ..Default::default() + }], + default_options(), + ); + + let does_not_matter = gix_hash::Kind::Sha1.null(); + let mut buf = Vec::new(); + let a_name = "a"; + let a_content = "a\0b"; + std::fs::write(tmp.path().join(a_name), a_content.as_bytes())?; + let out = filter.convert_to_diffable( + &does_not_matter, + EntryKind::Blob, + a_name.into(), + ResourceKind::OldOrSource, + &mut |_, _| {}, + &gix_object::find::Never, + pipeline::Mode::default(), + &mut buf, + )?; + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, Some(pipeline::Data::Binary { size: 3 })); + assert_eq!(buf.len(), 0, "binary files aren't stored, even if we read them"); + + // LINK with null-bytes can't be created, and generally we ignore a lot of checks on links + // for good reason. Hard to test. + drop(tmp); + + let mut db = ObjectDb::default(); + let b_content = "b-co\0ntent\n"; + let id = db.insert(b_content); + + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + a_name.into(), + ResourceKind::NewOrDestination, + &mut |_, _| {}, + &db, + pipeline::Mode::default(), + &mut buf, + )?; + + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, Some(pipeline::Data::Binary { size: 11 })); + assert_eq!(buf.len(), 0, "buffers are cleared even if we read them"); + + let platform = attributes.at_entry("c", Some(false), &gix_object::find::Never)?; + + let id = db.insert("b"); + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + "c".into(), + ResourceKind::NewOrDestination, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &db, + pipeline::Mode::default(), + &mut buf, + )?; + + assert_eq!(out.driver_index, Some(0)); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + "\0b", + "if binary-text-conversion is set, we don't care if it outputs zero bytes, let everything pass" + ); + + Ok(()) + } + + #[test] + fn with_driver() -> crate::Result { + let root = gix_testtools::scripted_fixture_read_only_standalone("make_blob_repo.sh")?; + let command = "echo to-text; cat <"; + let mut attributes = gix_worktree::Stack::new( + &root, + gix_worktree::stack::State::AttributesStack(gix_worktree::stack::state::Attributes::new( + Default::default(), + None, + attributes::Source::WorktreeThenIdMapping, + Default::default(), + )), + gix_worktree::glob::pattern::Case::Sensitive, + Vec::new(), + Vec::new(), + ); + let mut filter = gix_diff::blob::Pipeline::new( + WorktreeRoots { + old_root: Some(root.clone()), + new_root: None, + }, + gix_filter::Pipeline::default(), + vec![ + gix_diff::blob::Driver { + name: "a".into(), + binary_to_text_command: Some(command.into()), + ..Default::default() + }, + gix_diff::blob::Driver { + name: "b".into(), + is_binary: Some(true), + ..Default::default() + }, + gix_diff::blob::Driver { + name: "c".into(), + binary_to_text_command: Some(command.into()), + is_binary: Some(true), + ..Default::default() + }, + gix_diff::blob::Driver { + name: "d".into(), + binary_to_text_command: Some(command.into()), + ..Default::default() + }, + gix_diff::blob::Driver { + name: "missing".into(), + ..Default::default() + }, + ], + default_options(), + ); + + let mut db = ObjectDb::default(); + let null = gix_hash::Kind::Sha1.null(); + let mut buf = Vec::new(); + let platform = attributes.at_entry("a", Some(false), &gix_object::find::Never)?; + let worktree_modes = [ + pipeline::Mode::ToWorktreeAndBinaryToText, + pipeline::Mode::ToGitUnlessBinaryToTextIsPresent, + ]; + let all_modes = [ + pipeline::Mode::ToGit, + pipeline::Mode::ToWorktreeAndBinaryToText, + pipeline::Mode::ToGitUnlessBinaryToTextIsPresent, + ]; + for mode in worktree_modes { + let out = filter.convert_to_diffable( + &null, + EntryKind::Blob, + "a".into(), + ResourceKind::OldOrSource, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &gix_object::find::Never, + mode, + &mut buf, + )?; + assert_eq!(out.driver_index, Some(0)); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!(buf.as_bstr(), "to-text\na\n", "filter was applied"); + } + + let out = filter.convert_to_diffable( + &null, + EntryKind::Blob, + "a".into(), + ResourceKind::OldOrSource, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &gix_object::find::Never, + pipeline::Mode::ToGit, + &mut buf, + )?; + assert_eq!(out.driver_index, Some(0)); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!(buf.as_bstr(), "a\n", "unconditionally use git according to mode"); + + let id = db.insert("a\n"); + for mode in worktree_modes { + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + "a".into(), + ResourceKind::NewOrDestination, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &db, + mode, + &mut buf, + )?; + assert_eq!(out.driver_index, Some(0)); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!(buf.as_bstr(), "to-text\na\n", "filter was applied"); + } + + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + "a".into(), + ResourceKind::NewOrDestination, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &db, + pipeline::Mode::ToGit, + &mut buf, + )?; + assert_eq!(out.driver_index, Some(0)); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + "a\n", + "no filter was applied in this mode, also when using the ODB" + ); + + let platform = attributes.at_entry("missing", Some(false), &gix_object::find::Never)?; + for mode in all_modes { + buf.push(1); + let out = filter.convert_to_diffable( + &null, + EntryKind::Link, + "missing".into(), /* does not actually exist */ + ResourceKind::OldOrSource, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &gix_object::find::Never, + mode, + &mut buf, + )?; + assert_eq!(out.driver_index, Some(4), "despite missing, we get driver information"); + assert_eq!(out.data, None); + assert_eq!(buf.len(), 0, "always cleared"); + + buf.push(1); + let out = filter.convert_to_diffable( + &null, + EntryKind::Link, + "missing".into(), /* does not actually exist */ + ResourceKind::NewOrDestination, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &gix_object::find::Never, + mode, + &mut buf, + )?; + assert_eq!(out.driver_index, Some(4), "despite missing, we get driver information"); + assert_eq!(out.data, None); + assert_eq!(buf.len(), 0, "always cleared"); + + buf.push(1); + let id = db.insert("link-target"); + let out = filter.convert_to_diffable( + &id, + EntryKind::Link, + "missing".into(), + ResourceKind::NewOrDestination, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &db, + mode, + &mut buf, + )?; + assert_eq!(out.driver_index, Some(4), "despite missing, we get driver information"); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + "link-target", + "no matter what, links always look the same." + ); + } + + let platform = attributes.at_entry("b", Some(false), &gix_object::find::Never)?; + for mode in all_modes { + buf.push(1); + let out = filter.convert_to_diffable( + &null, + EntryKind::Blob, + "b".into(), + ResourceKind::OldOrSource, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &gix_object::find::Never, + mode, + &mut buf, + )?; + + assert_eq!(out.driver_index, Some(1)); + assert_eq!( + out.data, + Some(pipeline::Data::Binary { size: 2 }), + "binary value comes from driver, and it's always respected with worktree source" + ); + assert_eq!(buf.len(), 0, "it's always cleared before any potential use"); + } + + let id = db.insert("b\n"); + for mode in all_modes { + buf.push(1); + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + "b".into(), + ResourceKind::NewOrDestination, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &db, + mode, + &mut buf, + )?; + + assert_eq!(out.driver_index, Some(1)); + assert_eq!( + out.data, + Some(pipeline::Data::Binary { size: 2 }), + "binary value comes from driver, and it's always respected with DB source" + ); + assert_eq!(buf.len(), 0, "it's always cleared before any potential use"); + } + + let platform = attributes.at_entry("c", Some(false), &gix_object::find::Never)?; + for mode in worktree_modes { + let out = filter.convert_to_diffable( + &null, + EntryKind::Blob, + "c".into(), + ResourceKind::OldOrSource, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &gix_object::find::Never, + mode, + &mut buf, + )?; + assert_eq!(out.driver_index, Some(2)); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + "to-text\nc\n", + "filter was applied, it overrides binary=true" + ); + } + + let id = db.insert("c\n"); + for mode in worktree_modes { + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + "c".into(), + ResourceKind::NewOrDestination, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &db, + mode, + &mut buf, + )?; + assert_eq!(out.driver_index, Some(2)); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + "to-text\nc\n", + "filter was applied, it overrides binary=true" + ); + } + + let platform = attributes.at_entry("unset", Some(false), &gix_object::find::Never)?; + for mode in all_modes { + let out = filter.convert_to_diffable( + &null, + EntryKind::Blob, + "unset".into(), + ResourceKind::OldOrSource, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &gix_object::find::Never, + mode, + &mut buf, + )?; + assert_eq!( + out.driver_index, None, + "no driver is associated, as `diff` is explicitly unset" + ); + assert_eq!( + out.data, + Some(pipeline::Data::Binary { size: 6 }), + "unset counts as binary" + ); + assert_eq!(buf.len(), 0); + } + + let id = db.insert("unset\n"); + for mode in all_modes { + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + "unset".into(), + ResourceKind::NewOrDestination, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &db, + mode, + &mut buf, + )?; + assert_eq!( + out.driver_index, None, + "no driver is associated, as `diff` is explicitly unset" + ); + assert_eq!( + out.data, + Some(pipeline::Data::Binary { size: 6 }), + "unset counts as binary" + ); + assert_eq!(buf.len(), 0); + } + + let platform = attributes.at_entry("d", Some(false), &gix_object::find::Never)?; + let id = db.insert("d-in-db"); + for mode in worktree_modes { + let out = filter.convert_to_diffable( + &null, + EntryKind::Blob, + "d".into(), + ResourceKind::OldOrSource, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &gix_object::find::Never, + mode, + &mut buf, + )?; + assert_eq!(out.driver_index, Some(3)); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + "to-text\nd\n", + "the worktree + text conversion was triggered for worktree source" + ); + + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + "d".into(), + ResourceKind::NewOrDestination, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &db, + mode, + &mut buf, + )?; + assert_eq!(out.driver_index, Some(3)); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + "to-text\nd-in-db", + "the worktree + text conversion was triggered for db source" + ); + } + + let platform = attributes.at_entry("e-no-attr", Some(false), &gix_object::find::Never)?; + let out = filter.convert_to_diffable( + &null, + EntryKind::Blob, + "e-no-attr".into(), + ResourceKind::OldOrSource, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &gix_object::find::Never, + pipeline::Mode::ToGitUnlessBinaryToTextIsPresent, + &mut buf, + )?; + assert_eq!(out.driver_index, None); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + "e\n", + "no text filter, so git conversion was applied for worktree source" + ); + + let id = db.insert("e-in-db"); + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + "e-no-attr".into(), + ResourceKind::NewOrDestination, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &db, + pipeline::Mode::ToGitUnlessBinaryToTextIsPresent, + &mut buf, + )?; + assert_eq!(out.driver_index, None); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + "e-in-db", + "no text filter, so git conversion was applied for ODB source" + ); + + Ok(()) + } + + pub(crate) fn default_options() -> Options { + Options { + large_file_threshold_bytes: 0, + fs: gix_fs::Capabilities { + precompose_unicode: false, + ignore_case: false, + executable_bit: true, + symlink: true, + }, + } + } +} diff --git a/gix-diff/tests/blob/platform.rs b/gix-diff/tests/blob/platform.rs new file mode 100644 index 00000000000..3a1fb57a9f0 --- /dev/null +++ b/gix-diff/tests/blob/platform.rs @@ -0,0 +1,373 @@ +use crate::blob::pipeline::convert_to_diffable::default_options; +use crate::hex_to_id; +use crate::util::ObjectDb; +use gix_diff::blob::platform::prepare_diff; +use gix_diff::blob::platform::prepare_diff::Operation; +use gix_diff::blob::{pipeline, platform, Algorithm, Platform, ResourceKind}; +use gix_object::bstr::{BString, ByteSlice}; +use gix_object::tree::EntryKind; +use gix_worktree::stack::state::attributes; + +#[test] +fn resources_of_worktree_and_odb_and_check_link() -> crate::Result { + let mut platform = new_platform( + Some(gix_diff::blob::Driver { + name: "a".into(), + ..Default::default() + }), + gix_diff::blob::pipeline::Mode::default(), + ); + platform.set_resource( + gix_hash::Kind::Sha1.null(), + EntryKind::Blob, + "a".into(), + ResourceKind::OldOrSource, + &gix_object::find::Never, + )?; + + let mut db = ObjectDb::default(); + let a_content = "a-content"; + let id = db.insert(a_content); + platform.set_resource( + id, + EntryKind::BlobExecutable, + "a".into(), + ResourceKind::NewOrDestination, + &db, + )?; + + let (old, new) = platform.resources().expect("previously set source and destination"); + assert_eq!(old.data.as_slice().expect("present").as_bstr(), "a\n"); + assert_eq!(old.driver_index, Some(0)); + assert_eq!(old.mode, EntryKind::Blob); + assert!(old.id.is_null(), "id is used verbatim"); + assert_eq!(old.rela_path, "a", "location is kept directly as provided"); + assert_eq!(new.data.as_slice().expect("present").as_bstr(), a_content); + assert_eq!(new.driver_index, Some(0)); + assert_eq!(new.mode, EntryKind::BlobExecutable); + assert_eq!(new.id, hex_to_id("4c469b6c8c4486fdc9ded9d597d8f6816a455707")); + assert_eq!(new.rela_path, "a", "location is kept directly as provided"); + + let out = platform.prepare_diff()?; + assert_eq!( + out.operation, + Operation::InternalDiff { + algorithm: Algorithm::Histogram + }, + "it ends up with the default, as it's not overridden anywhere" + ); + + assert_eq!( + comparable_ext_diff(platform.prepare_diff_command( + "test".into(), + gix_diff::command::Context { + git_dir: Some(".".into()), + ..Default::default() + }, + 2, + 3 + )), + format!("{}test a 0000000000000000000000000000000000000000 100644 4c469b6c8c4486fdc9ded9d597d8f6816a455707 100755", (!cfg!(windows)).then_some("GIT_DIFF_PATH_COUNTER=3 GIT_DIFF_PATH_TOTAL=3 GIT_DIR=. ").unwrap_or_default()), + "in this case, there is no rename-to field as last argument, it's based on the resource paths being different" + ); + + platform.set_resource(id, EntryKind::Link, "a".into(), ResourceKind::NewOrDestination, &db)?; + + // Double-inserts are fine. + platform.set_resource(id, EntryKind::Link, "a".into(), ResourceKind::NewOrDestination, &db)?; + let (old, new) = platform.resources().expect("previously set source and destination"); + assert_eq!( + old.data.as_slice().expect("present").as_bstr(), + "a\n", + "the source is still the same" + ); + assert_eq!(old.mode, EntryKind::Blob); + assert_eq!( + new.mode, + EntryKind::Link, + "but the destination has changed as is now a link" + ); + assert_eq!( + new.data.as_slice().expect("present").as_bstr(), + a_content, + "despite the same content" + ); + assert_eq!(new.id, hex_to_id("4c469b6c8c4486fdc9ded9d597d8f6816a455707")); + assert_eq!(new.rela_path, "a"); + + let out = platform.prepare_diff()?; + assert_eq!( + out.operation, + Operation::InternalDiff { + algorithm: Algorithm::Histogram + }, + "it would still diff, despite this being blob-with-link now. But that's fine." + ); + + assert_eq!( + comparable_ext_diff(platform.prepare_diff_command( + "test".into(), + gix_diff::command::Context { + git_dir: Some(".".into()), + ..Default::default() + }, + 0, + 1 + )), + format!("{}test a 0000000000000000000000000000000000000000 100644 4c469b6c8c4486fdc9ded9d597d8f6816a455707 120000", (!cfg!(windows)).then_some(r#"GIT_DIFF_PATH_COUNTER=1 GIT_DIFF_PATH_TOTAL=1 GIT_DIR=. "#).unwrap_or_default()), + "Also obvious that symlinks are definitely special, but it's what git does as well" + ); + + platform.clear_resource_cache(); + assert_eq!( + platform.resources(), + None, + "clearing the cache voids resources and one has to set it up again" + ); + + Ok(()) +} + +fn comparable_ext_diff( + cmd: Result< + gix_diff::blob::platform::prepare_diff_command::Command, + gix_diff::blob::platform::prepare_diff_command::Error, + >, +) -> String { + let cmd = cmd.expect("no error"); + let tokens = shell_words::split(&format!("{:?}", *cmd)).expect("parses fine"); + let ofs = if cfg!(windows) { 3 } else { 0 }; // Windows doesn't show env vars + tokens + .into_iter() + .enumerate() + .filter_map(|(idx, s)| { + (idx != (5 - ofs) && idx != (8 - ofs)) + .then_some(s) + .or_else(|| Some("".into())) + }) + .collect::>() + .join(" ") +} + +#[test] +fn diff_binary() -> crate::Result { + let mut platform = new_platform( + Some(gix_diff::blob::Driver { + name: "a".into(), + is_binary: Some(true), + ..Default::default() + }), + gix_diff::blob::pipeline::Mode::default(), + ); + platform.set_resource( + gix_hash::Kind::Sha1.null(), + EntryKind::Blob, + "a".into(), + ResourceKind::OldOrSource, + &gix_object::find::Never, + )?; + + let mut db = ObjectDb::default(); + let a_content = "b"; + let id = db.insert(a_content); + platform.set_resource(id, EntryKind::Blob, "b".into(), ResourceKind::NewOrDestination, &db)?; + + let out = platform.prepare_diff()?; + assert!( + matches!(out.operation, Operation::SourceOrDestinationIsBinary), + "one binary resource is enough to skip diffing entirely" + ); + + match platform.prepare_diff_command("test".into(), Default::default(), 0, 1) { + Err(err) => assert_eq!( + err.to_string(), + "Binary resources can't be diffed with an external command (as we don't have the data anymore)" + ), + Ok(_) => unreachable!("must error"), + } + + Ok(()) +} + +#[test] +fn diff_performed_despite_external_command() -> crate::Result { + let mut platform = new_platform( + Some(gix_diff::blob::Driver { + name: "a".into(), + command: Some("something-to-be-ignored".into()), + algorithm: Some(Algorithm::Myers), + ..Default::default() + }), + gix_diff::blob::pipeline::Mode::default(), + ); + platform.set_resource( + gix_hash::Kind::Sha1.null(), + EntryKind::Blob, + "a".into(), + ResourceKind::OldOrSource, + &gix_object::find::Never, + )?; + + let mut db = ObjectDb::default(); + let a_content = "b"; + let id = db.insert(a_content); + platform.set_resource(id, EntryKind::Blob, "b".into(), ResourceKind::NewOrDestination, &db)?; + + let out = platform.prepare_diff()?; + assert!( + matches!( + out.operation, + Operation::InternalDiff { + algorithm: Algorithm::Myers + } + ), + "by default, we prepare for internal diffs, unless external commands are enabled.\ + The caller could still obtain the command from here if they wanted to, as well.\ + Also, the algorithm is overridden by the source." + ); + Ok(()) +} + +#[test] +fn diff_skipped_due_to_external_command_and_enabled_option() -> crate::Result { + let command: BString = "something-to-be-ignored".into(); + let mut platform = new_platform( + Some(gix_diff::blob::Driver { + name: "a".into(), + command: Some(command.clone()), + algorithm: Some(Algorithm::Myers), + ..Default::default() + }), + gix_diff::blob::pipeline::Mode::default(), + ); + platform.options.skip_internal_diff_if_external_is_configured = true; + + platform.set_resource( + gix_hash::Kind::Sha1.null(), + EntryKind::Blob, + "a".into(), + ResourceKind::OldOrSource, + &gix_object::find::Never, + )?; + + let mut db = ObjectDb::default(); + let a_content = "b"; + let id = db.insert(a_content); + platform.set_resource(id, EntryKind::Blob, "b".into(), ResourceKind::NewOrDestination, &db)?; + + let out = platform.prepare_diff()?; + assert_eq!( + out.operation, + Operation::ExternalCommand { + command: command.as_ref() + }, + "now we provide all information that is needed to run the overridden diff command" + ); + Ok(()) +} + +#[test] +fn source_and_destination_do_not_exist() -> crate::Result { + let mut platform = new_platform(None, pipeline::Mode::default()); + platform.set_resource( + gix_hash::Kind::Sha1.null(), + EntryKind::Blob, + "missing".into(), + ResourceKind::OldOrSource, + &gix_object::find::Never, + )?; + + platform.set_resource( + gix_hash::Kind::Sha1.null(), + EntryKind::BlobExecutable, + "a".into(), + ResourceKind::NewOrDestination, + &gix_object::find::Never, + )?; + + let (old, new) = platform.resources().expect("previously set source and destination"); + assert_eq!(old.data, platform::resource::Data::Missing); + assert_eq!(old.driver_index, None); + assert_eq!(old.mode, EntryKind::Blob); + assert_eq!(new.data, platform::resource::Data::Missing); + assert_eq!(new.driver_index, None); + assert_eq!(new.mode, EntryKind::BlobExecutable); + + assert!(matches!( + platform.prepare_diff(), + Err(prepare_diff::Error::SourceAndDestinationRemoved) + )); + + assert_eq!( + format!( + "{:?}", + *platform + .prepare_diff_command( + "test".into(), + gix_diff::command::Context { + git_dir: Some(".".into()), + ..Default::default() + }, + 0, + 1 + ) + .expect("resources set") + ), + format!( + r#"{}"test" "missing" "/dev/null" "." "." "/dev/null" "." "." "a""#, + (!cfg!(windows)) + .then_some(r#"GIT_DIFF_PATH_COUNTER="1" GIT_DIFF_PATH_TOTAL="1" GIT_DIR="." "#) + .unwrap_or_default() + ) + ); + Ok(()) +} + +#[test] +fn invalid_resource_types() { + let mut platform = new_platform(None, pipeline::Mode::default()); + for (mode, name) in [(EntryKind::Commit, "Commit"), (EntryKind::Tree, "Tree")] { + assert_eq!( + platform + .set_resource( + gix_hash::Kind::Sha1.null(), + mode, + "a".into(), + ResourceKind::NewOrDestination, + &gix_object::find::Never, + ) + .unwrap_err() + .to_string(), + format!("Can only diff blobs and links, not {name}") + ); + } +} + +fn new_platform( + drivers: impl IntoIterator, + mode: gix_diff::blob::pipeline::Mode, +) -> Platform { + let root = gix_testtools::scripted_fixture_read_only_standalone("make_blob_repo.sh").expect("valid fixture"); + let attributes = gix_worktree::Stack::new( + &root, + gix_worktree::stack::State::AttributesStack(gix_worktree::stack::state::Attributes::new( + Default::default(), + None, + attributes::Source::WorktreeThenIdMapping, + Default::default(), + )), + gix_worktree::glob::pattern::Case::Sensitive, + Vec::new(), + Vec::new(), + ); + let filter = gix_diff::blob::Pipeline::new( + pipeline::WorktreeRoots { + old_root: Some(root.clone()), + new_root: None, + }, + gix_filter::Pipeline::default(), + drivers.into_iter().collect(), + default_options(), + ); + Platform::new(Default::default(), filter, mode, attributes) +} diff --git a/gix-diff/tests/diff.rs b/gix-diff/tests/diff.rs index 3ba4af8e15f..1a230f2575f 100644 --- a/gix-diff/tests/diff.rs +++ b/gix-diff/tests/diff.rs @@ -5,4 +5,54 @@ fn hex_to_id(hex: &str) -> gix_hash::ObjectId { } mod blob; +mod rewrites; mod tree; + +mod util { + use gix_hash::oid; + use gix_object::bstr::BString; + use gix_object::find::Error; + use std::collections::HashMap; + + #[derive(Default)] + pub struct ObjectDb { + data_by_id: HashMap, + } + + impl gix_object::FindHeader for ObjectDb { + fn try_header(&self, id: &oid) -> Result, Error> { + match self.data_by_id.get(&id.to_owned()) { + Some(data) => Ok(Some(gix_object::Header { + kind: gix_object::Kind::Blob, + size: data.len() as u64, + })), + None => Ok(None), + } + } + } + + impl gix_object::Find for ObjectDb { + fn try_find<'a>(&self, id: &oid, buffer: &'a mut Vec) -> Result>, Error> { + match self.data_by_id.get(&id.to_owned()) { + Some(data) => { + buffer.resize(data.len(), 0); + buffer.copy_from_slice(data); + Ok(Some(gix_object::Data { + kind: gix_object::Kind::Blob, + data: buffer.as_slice(), + })) + } + None => Ok(None), + } + } + } + + impl ObjectDb { + /// Insert `data` and return its hash. That can be used to find it again. + pub fn insert(&mut self, data: &str) -> gix_hash::ObjectId { + let id = gix_object::compute_hash(gix_hash::Kind::Sha1, gix_object::Kind::Blob, data.as_bytes()); + self.data_by_id.insert(id, data.into()); + id + } + } +} diff --git a/gix-diff/tests/fixtures/generated-archives/make_blob_repo.tar.xz b/gix-diff/tests/fixtures/generated-archives/make_blob_repo.tar.xz new file mode 100644 index 00000000000..515ebec8e50 --- /dev/null +++ b/gix-diff/tests/fixtures/generated-archives/make_blob_repo.tar.xz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7ada04b2d2df9b375d6d80499fee7eb3ff7ef5d03e9e3784376645fe533aea7 +size 10744 diff --git a/gix-diff/tests/fixtures/generated-archives/make_diff_repo.tar.xz b/gix-diff/tests/fixtures/generated-archives/make_diff_repo.tar.xz index a94a01f9e26..12b1c3e932f 100644 --- a/gix-diff/tests/fixtures/generated-archives/make_diff_repo.tar.xz +++ b/gix-diff/tests/fixtures/generated-archives/make_diff_repo.tar.xz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c37d85f4467ebe265761ffb989c0d924582e9fa5f660e1b3ad0365aca4599e84 -size 18468 +oid sha256:78430a7c39ab1337d311e3beb8988ffa1a131d40630b4e4662104a194823b596 +size 18440 diff --git a/gix-diff/tests/fixtures/make_blob_repo.sh b/gix-diff/tests/fixtures/make_blob_repo.sh new file mode 100644 index 00000000000..ac0079a68af --- /dev/null +++ b/gix-diff/tests/fixtures/make_blob_repo.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set -eu -o pipefail + +git init -q + +echo a > a +echo b > b +echo c > c +echo d > d +echo e > e-no-attr +echo unset > unset + +cat <.gitattributes +a diff=a +b diff=b +c diff=c +d diff=d +missing diff=missing +unset -diff +EOF + +git add . && git commit -m "init" diff --git a/gix-diff/tests/fixtures/make_diff_repo.sh b/gix-diff/tests/fixtures/make_diff_repo.sh index 5b4859e9393..7ae9ccab22a 100755 --- a/gix-diff/tests/fixtures/make_diff_repo.sh +++ b/gix-diff/tests/fixtures/make_diff_repo.sh @@ -1,5 +1,4 @@ #!/bin/bash -set -x set -eu -o pipefail diff --git a/gix-diff/tests/rewrites/mod.rs b/gix-diff/tests/rewrites/mod.rs new file mode 100644 index 00000000000..be44ef39eda --- /dev/null +++ b/gix-diff/tests/rewrites/mod.rs @@ -0,0 +1,56 @@ +use gix_diff::rewrites::tracker::ChangeKind; +use gix_hash::{oid, ObjectId}; +use gix_object::tree::{EntryKind, EntryMode}; + +mod tracker; + +#[derive(Clone)] +pub struct Change { + id: ObjectId, + kind: ChangeKind, + mode: EntryMode, +} + +impl gix_diff::rewrites::tracker::Change for Change { + fn id(&self) -> &oid { + &self.id + } + + fn kind(&self) -> ChangeKind { + self.kind + } + + fn entry_mode(&self) -> EntryMode { + self.mode + } + + fn id_and_entry_mode(&self) -> (&oid, EntryMode) { + (&self.id, self.mode) + } +} + +const NULL_ID: gix_hash::ObjectId = gix_hash::Kind::Sha1.null(); + +impl Change { + fn modification() -> Self { + Change { + id: NULL_ID, + kind: ChangeKind::Modification, + mode: EntryKind::Blob.into(), + } + } + fn deletion() -> Self { + Change { + id: NULL_ID, + kind: ChangeKind::Deletion, + mode: EntryKind::Blob.into(), + } + } + fn addition() -> Self { + Change { + id: NULL_ID, + kind: ChangeKind::Addition, + mode: EntryKind::Blob.into(), + } + } +} diff --git a/gix-diff/tests/rewrites/tracker.rs b/gix-diff/tests/rewrites/tracker.rs new file mode 100644 index 00000000000..a50af02f277 --- /dev/null +++ b/gix-diff/tests/rewrites/tracker.rs @@ -0,0 +1,629 @@ +use crate::hex_to_id; +use crate::rewrites::{Change, NULL_ID}; +use gix_diff::blob::DiffLineStats; +use gix_diff::rewrites::tracker::visit::{Source, SourceKind}; +use gix_diff::rewrites::tracker::ChangeKind; +use gix_diff::rewrites::{Copies, CopySource}; +use gix_diff::tree::visit::Action; +use gix_diff::{rewrites, Rewrites}; +use gix_object::tree::EntryKind; + +#[test] +fn rename_by_id() -> crate::Result { + // Limits are only applied when doing rewrite-checks + for limit in [0, 1] { + let rewrites = Rewrites { + copies: None, + percentage: None, + limit, + }; + let mut track = util::new_tracker(rewrites); + assert!( + track.try_push_change(Change::modification(), "a".into()).is_some(), + "modifications play no role in rename tracking" + ); + assert!( + track.try_push_change(Change::deletion(), "b".into()).is_none(), + "recorded for later matching" + ); + assert!( + track.try_push_change(Change::addition(), "c".into()).is_none(), + "recorded for later matching" + ); + let mut called = false; + let out = util::assert_emit(&mut track, |dst, src| { + assert!(!called, "only one rename pair is expected"); + called = true; + assert_eq!( + src.unwrap(), + Source { + entry_mode: EntryKind::Blob.into(), + id: NULL_ID, + kind: SourceKind::Rename, + location: "b".into(), + diff: None, + } + ); + assert_eq!(dst.location, "c"); + Action::Continue + }); + assert_eq!( + out, + rewrites::Outcome { + options: rewrites, + ..Default::default() + }, + "no similarity check was performed, it was all matched by id" + ); + } + Ok(()) +} + +#[test] +fn copy_by_similarity_reports_limit_if_encountered() -> crate::Result { + let rewrites = Rewrites { + copies: Some(Copies { + source: CopySource::FromSetOfModifiedFiles, + percentage: Some(0.5), + }), + percentage: None, + limit: 1, + }; + let mut track = util::new_tracker(rewrites); + let odb = util::add_retained_blobs( + &mut track, + [ + (Change::modification(), "a", "a\n"), + (Change::addition(), "a-cpy-1", "a"), + (Change::addition(), "a-cpy-2", "a"), + (Change::modification(), "d", "ab"), + ], + ); + + let mut calls = 0; + let out = util::assert_emit_with_objects( + &mut track, + |dst, src| { + assert!(src.is_none()); + match calls { + 0 => assert_eq!(dst.location, "a"), + 1 => assert_eq!(dst.location, "a-cpy-1"), + 2 => assert_eq!(dst.location, "a-cpy-2"), + 3 => assert_eq!(dst.location, "d"), + _ => panic!("too many emissions"), + } + calls += 1; + Action::Continue + }, + odb, + ); + assert_eq!( + out, + rewrites::Outcome { + options: rewrites, + num_similarity_checks_skipped_for_copy_tracking_due_to_limit: 4, + ..Default::default() + }, + "no similarity check was performed at all - all or nothing" + ); + Ok(()) +} + +#[test] +fn copy_by_id() -> crate::Result { + // Limits are only applied when doing rewrite-checks + for limit in [0, 1] { + let rewrites = Rewrites { + copies: Some(Copies { + source: CopySource::FromSetOfModifiedFiles, + percentage: None, + }), + percentage: None, + limit, + }; + let mut track = util::new_tracker(rewrites); + let odb = util::add_retained_blobs( + &mut track, + [ + (Change::modification(), "a", "a"), + (Change::addition(), "a-cpy-1", "a"), + (Change::addition(), "a-cpy-2", "a"), + (Change::modification(), "d", "a"), + ], + ); + + let mut calls = 0; + let out = util::assert_emit_with_objects( + &mut track, + |dst, src| { + let source_a = Source { + entry_mode: EntryKind::Blob.into(), + id: hex_to_id("2e65efe2a145dda7ee51d1741299f848e5bf752e"), + kind: SourceKind::Copy, + location: "a".into(), + diff: None, + }; + match calls { + 0 => { + assert_eq!(src.unwrap(), source_a); + assert_eq!( + dst.location, "a-cpy-1", + "it just finds the first possible match in order, ignoring other candidates" + ); + } + 1 => { + assert_eq!(src.unwrap(), source_a, "copy-sources can be used multiple times"); + assert_eq!(dst.location, "a-cpy-2"); + } + 2 => { + assert!(src.is_none()); + assert_eq!(dst.location, "d"); + } + _ => panic!("too many emissions"), + } + calls += 1; + Action::Continue + }, + odb, + ); + assert_eq!( + out, + rewrites::Outcome { + options: rewrites, + ..Default::default() + }, + "no similarity check was performed, it was all matched by id" + ); + } + Ok(()) +} + +#[test] +fn copy_by_id_search_in_all_sources() -> crate::Result { + // Limits are only applied when doing rewrite-checks + for limit in [0, 1] { + let rewrites = Rewrites { + copies: Some(Copies { + source: CopySource::FromSetOfModifiedFilesAndAllSources, + percentage: None, + }), + percentage: None, + limit, + }; + let mut track = util::new_tracker(rewrites); + let odb = util::add_retained_blobs( + &mut track, + [ + (Change::addition(), "a-cpy-1", "a"), + (Change::addition(), "a-cpy-2", "a"), + ], + ); + + let mut calls = 0; + let content_id = hex_to_id("2e65efe2a145dda7ee51d1741299f848e5bf752e"); + let out = util::assert_emit_with_objects_and_sources( + &mut track, + |dst, src| { + let source_a = Source { + entry_mode: EntryKind::Blob.into(), + id: content_id, + kind: SourceKind::Copy, + location: "a-src".into(), + diff: None, + }; + match calls { + 0 => { + assert_eq!(src.unwrap(), source_a); + assert_eq!( + dst.location, "a-cpy-1", + "it just finds the first possible match in order, ignoring other candidates" + ); + } + 1 => { + assert_eq!(src.unwrap(), source_a, "copy-sources can be used multiple times"); + assert_eq!(dst.location, "a-cpy-2"); + } + 2 => { + assert!(src.is_none()); + assert_eq!(dst.location, "d"); + } + _ => panic!("too many emissions"), + } + calls += 1; + Action::Continue + }, + odb, + [( + { + let mut c = Change::modification(); + c.id = content_id; + c + }, + "a-src", + )], + ); + assert_eq!( + out, + rewrites::Outcome { + options: rewrites, + ..Default::default() + }, + "no similarity check was performed, it was all matched by id" + ); + } + Ok(()) +} + +#[test] +fn copy_by_50_percent_similarity() -> crate::Result { + let rewrites = Rewrites { + copies: Some(Copies { + source: CopySource::FromSetOfModifiedFiles, + percentage: Some(0.5), + }), + percentage: None, + limit: 0, + }; + let mut track = util::new_tracker(rewrites); + let odb = util::add_retained_blobs( + &mut track, + [ + (Change::modification(), "a", "a\n"), + (Change::addition(), "a-cpy-1", "a\nb"), + (Change::addition(), "a-cpy-2", "a\nc"), + (Change::modification(), "d", "a"), + ], + ); + + let mut calls = 0; + let out = util::assert_emit_with_objects( + &mut track, + |dst, src| { + let source_a = Source { + entry_mode: EntryKind::Blob.into(), + id: hex_to_id("78981922613b2afb6025042ff6bd878ac1994e85"), + kind: SourceKind::Copy, + location: "a".into(), + diff: Some(DiffLineStats { + removals: 0, + insertions: 1, + before: 1, + after: 2, + similarity: 0.6666667, + }), + }; + match calls { + 0 => { + assert_eq!( + src.unwrap(), + source_a, + "it finds the first possible source, no candidates" + ); + assert_eq!(dst.location, "a-cpy-1"); + } + 1 => { + assert_eq!(src.unwrap(), source_a, "the same source can be reused as well"); + assert_eq!(dst.location, "a-cpy-2"); + } + 2 => { + assert!(src.is_none()); + assert_eq!(dst.location, "d"); + } + _ => panic!("too many emissions"), + } + calls += 1; + Action::Continue + }, + odb, + ); + assert_eq!( + out, + rewrites::Outcome { + options: rewrites, + num_similarity_checks: 4, + ..Default::default() + }, + "no similarity check was performed, it was all matched by id" + ); + Ok(()) +} + +#[test] +fn copy_by_id_in_additions_only() -> crate::Result { + let rewrites = Rewrites { + copies: Some(Copies { + source: CopySource::FromSetOfModifiedFiles, + percentage: None, + }), + percentage: None, + limit: 0, + }; + let mut track = util::new_tracker(rewrites); + let odb = util::add_retained_blobs( + &mut track, + [ + (Change::modification(), "a", "a"), + (Change::modification(), "a-cpy-1", "a"), + ], + ); + + let mut calls = 0; + let out = util::assert_emit_with_objects( + &mut track, + |dst, src| { + match calls { + 0 => { + assert!(src.is_none()); + assert_eq!(dst.location, "a"); + } + 1 => { + assert!(src.is_none()); + assert_eq!( + dst.location, "a-cpy-1", + "copy detection is only done for additions, not within modifications" + ); + } + _ => panic!("too many emissions"), + } + calls += 1; + Action::Continue + }, + odb, + ); + assert_eq!( + out, + rewrites::Outcome { + options: rewrites, + ..Default::default() + }, + "no similarity check was performed, it was all matched by id" + ); + Ok(()) +} + +#[test] +fn rename_by_similarity_reports_limit_if_encountered() -> crate::Result { + let rewrites = Rewrites { + copies: None, + percentage: Some(0.5), + limit: 1, + }; + let mut track = util::new_tracker(rewrites); + let odb = util::add_retained_blobs( + &mut track, + [ + (Change::deletion(), "a", "first\nsecond\n"), + (Change::addition(), "b", "firt\nsecond\n"), + (Change::addition(), "c", "second\nunrelated\n"), + ], + ); + + let mut calls = 0; + let out = util::assert_emit_with_objects( + &mut track, + |dst, src| { + assert!(src.is_none()); + match calls { + 0 => assert_eq!(dst.location, "a"), + 1 => assert_eq!(dst.location, "b"), + 2 => assert_eq!(dst.location, "c"), + _ => panic!("too many elements emitted"), + }; + calls += 1; + Action::Continue + }, + odb, + ); + assert_eq!( + out, + rewrites::Outcome { + options: rewrites, + num_similarity_checks_skipped_for_rename_tracking_due_to_limit: 2, + ..Default::default() + }, + "no similarity check was performed at all - all or nothing" + ); + Ok(()) +} + +#[test] +fn rename_by_50_percent_similarity() -> crate::Result { + let rewrites = Rewrites { + copies: None, + percentage: Some(0.5), + limit: 0, + }; + let mut track = util::new_tracker(rewrites); + let odb = util::add_retained_blobs( + &mut track, + [ + (Change::deletion(), "a", "first\nsecond\n"), + (Change::addition(), "b", "firt\nsecond\n"), + (Change::addition(), "c", "second\nunrelated\n"), + ], + ); + + let mut calls = 0; + let out = util::assert_emit_with_objects( + &mut track, + |dst, src| { + match calls { + 0 => { + assert_eq!( + src.unwrap(), + Source { + entry_mode: EntryKind::Blob.into(), + id: hex_to_id("66a52ee7a1d803dc57859c3e95ac9dcdc87c0164"), + kind: SourceKind::Rename, + location: "a".into(), + diff: Some(DiffLineStats { + removals: 1, + insertions: 1, + before: 2, + after: 2, + similarity: 0.53846157 + }) + } + ); + assert_eq!(dst.location, "b"); + } + 1 => { + assert!(src.is_none(), "pair already found"); + assert_eq!(dst.location, "c"); + } + _ => panic!("too many elements emitted"), + }; + calls += 1; + Action::Continue + }, + odb, + ); + assert_eq!( + out, + rewrites::Outcome { + options: rewrites, + num_similarity_checks: 1, + ..Default::default() + }, + "the first attempt already yields the one pair, so it doesn't participate anymore\ + - we don't have best candidates yet, thus only one check" + ); + Ok(()) +} + +#[test] +fn remove_only() -> crate::Result { + let mut track = util::new_tracker(Default::default()); + assert!( + track.try_push_change(Change::deletion(), "a".into()).is_none(), + "recorded for later matching" + ); + let mut called = false; + let out = util::assert_emit(&mut track, |dst, src| { + assert!(!called); + called = true; + assert_eq!(src, None, "there is just a single deletion, no pair"); + assert_eq!(dst.location, "a"); + assert_eq!(dst.change.kind, ChangeKind::Deletion); + Action::Continue + }); + assert_eq!(out, Default::default()); + Ok(()) +} + +#[test] +fn add_only() -> crate::Result { + let mut track = util::new_tracker(Default::default()); + assert!( + track.try_push_change(Change::addition(), "a".into()).is_none(), + "recorded for later matching - note that this is the starting point of a matching run" + ); + let mut called = false; + let out = util::assert_emit(&mut track, |dst, src| { + assert!(!called); + called = true; + assert_eq!(src, None, "there is just a single deletion, no pair"); + assert_eq!(dst.location, "a"); + assert_eq!(dst.change.kind, ChangeKind::Addition); + Action::Continue + }); + assert_eq!(out, Default::default()); + Ok(()) +} + +mod util { + use crate::rewrites::Change; + use crate::util::ObjectDb; + use gix_diff::rewrites::tracker::visit::{Destination, Source}; + use gix_diff::tree::visit::Action; + use gix_diff::{rewrites, Rewrites}; + + /// Add `blobs` `(change, location, data)` to tracker that will all be retained. Note that the `id` of the respective change will be adjusted to match. + pub fn add_retained_blobs<'a>( + tracker: &mut rewrites::Tracker, + blobs: impl IntoIterator, + ) -> ObjectDb { + let mut db = ObjectDb::default(); + for (mut change, location, data) in blobs { + change.id = db.insert(data); + assert!( + tracker.try_push_change(change, location.into()).is_none(), + "input changes must be tracked" + ); + } + db + } + + pub fn assert_emit( + tracker: &mut rewrites::Tracker, + cb: impl FnMut(Destination<'_, Change>, Option>) -> Action, + ) -> rewrites::Outcome { + assert_emit_with_objects(tracker, cb, gix_object::find::Never) + } + + pub fn assert_emit_with_objects( + tracker: &mut rewrites::Tracker, + cb: impl FnMut(Destination<'_, Change>, Option>) -> Action, + objects: impl gix_object::FindObjectOrHeader, + ) -> rewrites::Outcome { + assert_emit_with_objects_and_sources(tracker, cb, objects, None) + } + + pub fn assert_emit_with_objects_and_sources<'a>( + tracker: &mut rewrites::Tracker, + cb: impl FnMut(Destination<'_, Change>, Option>) -> Action, + objects: impl gix_object::FindObjectOrHeader, + sources: impl IntoIterator, + ) -> rewrites::Outcome { + let mut sources: Vec<_> = sources.into_iter().collect(); + tracker + .emit( + cb, + &mut new_platform_no_worktree(), + &objects, + |cb| -> Result<(), std::io::Error> { + let sources = std::mem::take(&mut sources); + if sources.is_empty() { + panic!("Should not access more sources unless these are specified"); + } + for (src, location) in sources { + cb(src, location.into()); + } + Ok(()) + }, + ) + .expect("emit doesn't fail") + } + + pub fn new_tracker(rewrites: Rewrites) -> rewrites::Tracker { + rewrites::Tracker::new(rewrites) + } + + fn new_platform_no_worktree() -> gix_diff::blob::Platform { + let root = gix_testtools::scripted_fixture_read_only_standalone("make_blob_repo.sh").expect("valid fixture"); + let attributes = gix_worktree::Stack::new( + root, + gix_worktree::stack::State::AttributesStack(gix_worktree::stack::state::Attributes::new( + Default::default(), + None, + gix_worktree::stack::state::attributes::Source::IdMapping, + Default::default(), + )), + gix_worktree::glob::pattern::Case::Sensitive, + Vec::new(), + Vec::new(), + ); + let filter = gix_diff::blob::Pipeline::new( + Default::default(), + gix_filter::Pipeline::default(), + Vec::new(), + Default::default(), + ); + gix_diff::blob::Platform::new( + Default::default(), + filter, + gix_diff::blob::pipeline::Mode::ToGit, + attributes, + ) + } +} diff --git a/gix-filter/Cargo.toml b/gix-filter/Cargo.toml index ebf2682e18c..8a047b67e75 100644 --- a/gix-filter/Cargo.toml +++ b/gix-filter/Cargo.toml @@ -18,6 +18,7 @@ gix-trace = { version = "^0.1.3", path = "../gix-trace" } gix-object = { version = "^0.38.0", path = "../gix-object" } gix-command = { version = "^0.2.10", path = "../gix-command" } gix-quote = { version = "^0.4.7", path = "../gix-quote" } +gix-utils = { version = "^0.1.5", path = "../gix-utils" } gix-path = { version = "^0.10.0", path = "../gix-path" } gix-packetline = { package = "gix-packetline-blocking", version = "^0.16.6", path = "../gix-packetline-blocking" } gix-attributes = { version = "^0.20.0", path = "../gix-attributes" } diff --git a/gix-filter/src/lib.rs b/gix-filter/src/lib.rs index 5fba895fdb9..7c0a0c37ba2 100644 --- a/gix-filter/src/lib.rs +++ b/gix-filter/src/lib.rs @@ -10,6 +10,9 @@ //! or not to apply such a filter. #![deny(rust_2018_idioms, missing_docs, unsafe_code)] +/// The `gix-attributes` crate whose types are mentioned in the public API of [Pipeline::convert_to_worktree()]. +pub use gix_attributes as attributes; + use bstr::BString; /// A forwarding of the `encoding_rs` crate for its types and convenience. pub use encoding_rs as encoding; @@ -43,7 +46,7 @@ pub struct Pipeline { /// State needed to keep running filter processes. processes: driver::State, /// A utility to handle multiple buffers to keep results of various filters. - bufs: pipeline::util::Buffers, + bufs: gix_utils::Buffers, } /// A declaration of a driver program. diff --git a/gix-filter/src/pipeline/convert.rs b/gix-filter/src/pipeline/convert.rs index 85f6a9b1861..dac54940958 100644 --- a/gix-filter/src/pipeline/convert.rs +++ b/gix-filter/src/pipeline/convert.rs @@ -22,8 +22,7 @@ pub mod configuration { /// pub mod to_git { /// A function that fills `buf` `fn(&mut buf)` with the data stored in the index of the file that should be converted. - pub type IndexObjectFn<'a> = - dyn FnMut(&mut Vec) -> Result, Box> + 'a; + pub type IndexObjectFn<'a> = dyn FnMut(&mut Vec) -> Result, gix_object::find::Error> + 'a; /// The error returned by [Pipeline::convert_to_git()][super::Pipeline::convert_to_git()]. #[derive(Debug, thiserror::Error)] @@ -190,7 +189,7 @@ impl Pipeline { self.options.eol_config, )?; - let mut bufs = self.bufs.with_src(src); + let mut bufs = self.bufs.use_foreign_src(src); let (src, dest) = bufs.src_and_dest(); if apply_ident_filter && ident::apply(src, self.options.object_hash, dest) { bufs.swap(); diff --git a/gix-filter/src/pipeline/mod.rs b/gix-filter/src/pipeline/mod.rs index c5efbb5a019..662e14ca15d 100644 --- a/gix-filter/src/pipeline/mod.rs +++ b/gix-filter/src/pipeline/mod.rs @@ -52,18 +52,13 @@ const ATTRS: [&str; 6] = ["crlf", "ident", "filter", "eol", "text", "working-tre /// Lifecycle impl Pipeline { - /// Create a new pipeline with configured `drivers` (which should be considered safe to invoke) with `context` as well as - /// a way to initialize our attributes with `collection`. + /// Create a new pipeline with configured `drivers` (which should be considered safe to invoke), which are passed `context`. /// `eol_config` serves as fallback to understand how to convert line endings if no line-ending attributes are present. /// `crlf_roundtrip_check` corresponds to the git-configuration of `core.safecrlf`. /// `object_hash` is relevant for the `ident` filter. - pub fn new( - collection: &gix_attributes::search::MetadataCollection, - context: gix_command::Context, - options: Options, - ) -> Self { + pub fn new(context: gix_command::Context, options: Options) -> Self { let mut attrs = gix_attributes::search::Outcome::default(); - attrs.initialize_with_selection(collection, ATTRS); + attrs.initialize_with_selection(&Default::default(), ATTRS); Pipeline { attrs, context: Context::default(), @@ -83,8 +78,7 @@ impl Pipeline { impl Default for Pipeline { fn default() -> Self { - let collection = Default::default(); - Pipeline::new(&collection, Default::default(), Default::default()) + Pipeline::new(Default::default(), Default::default()) } } @@ -108,12 +102,14 @@ impl Pipeline { pub fn options_mut(&mut self) -> &mut Options { &mut self.options } + + /// Return our double-buffers for re-use by the caller. + pub fn buffers_mut(&mut self) -> &mut gix_utils::Buffers { + &mut self.bufs + } } /// pub mod convert; pub(crate) mod util; - -#[cfg(test)] -mod tests; diff --git a/gix-filter/src/pipeline/tests.rs b/gix-filter/src/pipeline/tests.rs deleted file mode 100644 index e41a040c604..00000000000 --- a/gix-filter/src/pipeline/tests.rs +++ /dev/null @@ -1,39 +0,0 @@ -mod buffers { - use bstr::ByteSlice; - - use crate::pipeline::util::Buffers; - - #[test] - fn usage() { - let mut backing = Buffers::default(); - let mut bufs = backing.with_src(b"a"); - - { - let (src, dest) = bufs.src_and_dest(); - assert_eq!(src.as_bstr(), "a"); - assert!(dest.is_empty()); - dest.push(b'b'); - } - assert!(bufs.ro_src.is_some(), "read-only source remains until swap"); - bufs.swap(); - assert!( - bufs.ro_src.is_none(), - "after swap it's not used anymore as recent data is in owned buffers" - ); - - let (src, dest) = bufs.src_and_dest(); - assert_eq!(src.as_bstr(), "b", "buffers were swapped"); - assert_eq!(dest.as_bstr(), "", "destination is new and cleared"); - dest.push(b'c'); - bufs.swap(); - - let (src, dest) = bufs.src_and_dest(); - assert_eq!(src.as_bstr(), "c"); - assert_eq!(dest.as_bstr(), "", "destination is cleared"); - - let mut bufs = backing.with_src(b"z"); - let (src, dest) = bufs.src_and_dest(); - assert_eq!(src.as_bstr(), "z"); - assert_eq!(dest.as_bstr(), "", "output buffer was cleared by `with_src()`") - } -} diff --git a/gix-filter/src/pipeline/util.rs b/gix-filter/src/pipeline/util.rs index ab10db08ec9..f83c37357bf 100644 --- a/gix-filter/src/pipeline/util.rs +++ b/gix-filter/src/pipeline/util.rs @@ -11,54 +11,6 @@ use crate::{ Driver, }; -/// A utility to do buffer-swapping with. -#[derive(Default, Clone)] -pub(crate) struct Buffers { - pub src: Vec, - pub dest: Vec, -} - -/// A utility to do buffer-swapping with. -pub(crate) struct BuffersWithSource<'src, 'bufs> { - pub ro_src: Option<&'src [u8]>, - pub src: &'bufs mut Vec, - pub dest: &'bufs mut Vec, -} - -impl Buffers { - pub fn with_src<'a, 'src>(&'a mut self, src: &'src [u8]) -> BuffersWithSource<'src, 'a> { - self.clear(); - BuffersWithSource { - ro_src: Some(src), - src: &mut self.src, - dest: &mut self.dest, - } - } - pub fn clear(&mut self) { - self.src.clear(); - self.dest.clear(); - } - - pub fn swap(&mut self) { - std::mem::swap(&mut self.src, &mut self.dest); - } -} - -impl BuffersWithSource<'_, '_> { - /// Must be called after every change (i.e. when it's known that `dest` was written. - pub fn swap(&mut self) { - self.ro_src.take(); - std::mem::swap(&mut self.src, &mut self.dest); - self.dest.clear(); - } - pub fn src_and_dest(&mut self) -> (&[u8], &mut Vec) { - match self.ro_src { - Some(src) => (src, &mut self.dest), - None => (self.src, &mut self.dest), - } - } -} - pub(crate) struct Configuration<'a> { pub(crate) driver: Option<&'a Driver>, /// What attributes say about CRLF handling. diff --git a/gix-filter/tests/driver/mod.rs b/gix-filter/tests/driver/mod.rs index c5a38847e3b..adca2d06095 100644 --- a/gix-filter/tests/driver/mod.rs +++ b/gix-filter/tests/driver/mod.rs @@ -138,17 +138,18 @@ pub(crate) mod apply { fn a_crashing_process_can_restart_it() -> crate::Result { let mut state = gix_filter::driver::State::default(); let driver = driver_with_process(); + let err = match state.apply( + &driver, + &mut std::io::empty(), + Operation::Smudge, + context_from_path("fail"), + ) { + Ok(_) => panic!("expecting an error as invalid context was passed"), + Err(err) => err, + }; assert!( - matches!( - state.apply( - &driver, - &mut std::io::empty(), - Operation::Smudge, - context_from_path("fail") - ), - Err(gix_filter::driver::apply::Error::ProcessInvoke { .. }) - ), - "cannot invoke if failure is requested" + matches!(err, gix_filter::driver::apply::Error::ProcessInvoke { .. }), + "{err:?}: cannot invoke if failure is requested" ); let mut filtered = state diff --git a/gix-filter/tests/pipeline/convert_to_git.rs b/gix-filter/tests/pipeline/convert_to_git.rs index 491c6043d6a..79d79993238 100644 --- a/gix-filter/tests/pipeline/convert_to_git.rs +++ b/gix-filter/tests/pipeline/convert_to_git.rs @@ -53,7 +53,7 @@ fn all_stages_mean_streaming_is_impossible() -> gix_testtools::Result { Path::new("any.txt"), &mut |path, attrs| { cache - .at_entry(path, Some(false), gix_object::find::Never) + .at_entry(path, Some(false), &gix_object::find::Never) .expect("cannot fail") .matching_attributes(attrs); }, @@ -82,7 +82,7 @@ fn only_driver_means_streaming_is_possible() -> gix_testtools::Result { Path::new("subdir/doesnot/matter/any.txt"), &mut |path, attrs| { cache - .at_entry(path, Some(false), gix_object::find::Never) + .at_entry(path, Some(false), &gix_object::find::Never) .expect("cannot fail") .matching_attributes(attrs); }, @@ -112,7 +112,7 @@ fn no_filter_means_reader_is_returned_unchanged() -> gix_testtools::Result { Path::new("other.txt"), &mut |path, attrs| { cache - .at_entry(path, Some(false), gix_object::find::Never) + .at_entry(path, Some(false), &gix_object::find::Never) .expect("cannot fail") .matching_attributes(attrs); }, diff --git a/gix-filter/tests/pipeline/convert_to_worktree.rs b/gix-filter/tests/pipeline/convert_to_worktree.rs index 8798f5a22c7..2449c50304a 100644 --- a/gix-filter/tests/pipeline/convert_to_worktree.rs +++ b/gix-filter/tests/pipeline/convert_to_worktree.rs @@ -21,7 +21,7 @@ fn all_stages() -> gix_testtools::Result { "any.txt".into(), &mut |path, attrs| { cache - .at_entry(path, Some(false), gix_object::find::Never) + .at_entry(path, Some(false), &gix_object::find::Never) .expect("cannot fail") .matching_attributes(attrs); }, @@ -54,7 +54,7 @@ fn all_stages_no_filter() -> gix_testtools::Result { "other.txt".into(), &mut |path, attrs| { cache - .at_entry(path, Some(false), gix_object::find::Never) + .at_entry(path, Some(false), &gix_object::find::Never) .expect("cannot fail") .matching_attributes(attrs); }, @@ -86,7 +86,7 @@ fn no_filter() -> gix_testtools::Result { "other.txt".into(), &mut |path, attrs| { cache - .at_entry(path, Some(false), gix_object::find::Never) + .at_entry(path, Some(false), &gix_object::find::Never) .expect("cannot fail") .matching_attributes(attrs); }, diff --git a/gix-filter/tests/pipeline/mod.rs b/gix-filter/tests/pipeline/mod.rs index ad84b7fb775..8c9d8fffcb1 100644 --- a/gix-filter/tests/pipeline/mod.rs +++ b/gix-filter/tests/pipeline/mod.rs @@ -58,7 +58,6 @@ fn pipeline( let cache = attribute_cache(name)?; let (drivers, encodings_with_roundtrip_check, crlf_roundtrip_check, eol_config) = init(); let pipe = gix_filter::Pipeline::new( - cache.attributes_collection(), Default::default(), gix_filter::pipeline::Options { drivers, diff --git a/gix-fs/src/stack.rs b/gix-fs/src/stack.rs index d90d662ca34..5d3dfeccd34 100644 --- a/gix-fs/src/stack.rs +++ b/gix-fs/src/stack.rs @@ -63,7 +63,6 @@ impl Stack { relative.is_relative(), "only index paths are handled correctly here, must be relative" ); - debug_assert!(!relative.to_string_lossy().is_empty(), "empty paths are not allowed"); if self.valid_components == 0 { delegate.push_directory(self)?; diff --git a/gix-fs/tests/stack/mod.rs b/gix-fs/tests/stack/mod.rs index 3e8cb7ade03..5e122cdb007 100644 --- a/gix-fs/tests/stack/mod.rs +++ b/gix-fs/tests/stack/mod.rs @@ -141,5 +141,16 @@ fn delegate_calls_are_consistent() -> crate::Result { "the stack is state so keeps thinking it's a directory which is consistent. Git does it differently though." ); + s.make_relative_path_current("".as_ref(), &mut r)?; + assert_eq!( + r, + Record { + push_dir: 9, + dirs: vec![".".into()], + push: 14, + }, + "empty-paths reset the tree effectively" + ); + Ok(()) } diff --git a/gix-hash/src/oid.rs b/gix-hash/src/oid.rs index b105621cb69..f9c74adfec3 100644 --- a/gix-hash/src/oid.rs +++ b/gix-hash/src/oid.rs @@ -1,6 +1,6 @@ use std::{convert::TryInto, fmt, hash}; -use crate::{ObjectId, SIZE_OF_SHA1_DIGEST}; +use crate::{Kind, ObjectId, SIZE_OF_SHA1_DIGEST}; /// A borrowed reference to a hash identifying objects. /// @@ -137,6 +137,15 @@ impl oid { hex_len: self.bytes.len() * 2, } } + + /// Returns `true` if this hash consists of all null bytes. + #[inline] + #[doc(alias = "is_zero", alias = "git2")] + pub fn is_null(&self) -> bool { + match self.kind() { + Kind::Sha1 => &self.bytes == oid::null_sha1().as_bytes(), + } + } } /// Sha1 specific methods diff --git a/gix-hash/tests/oid/mod.rs b/gix-hash/tests/oid/mod.rs index d8449e7a048..7b46e559492 100644 --- a/gix-hash/tests/oid/mod.rs +++ b/gix-hash/tests/oid/mod.rs @@ -13,3 +13,9 @@ mod to_hex_with_len { ); } } + +#[test] +fn is_null() { + assert!(gix_hash::Kind::Sha1.null().is_null()); + assert!(gix_hash::Kind::Sha1.null().as_ref().is_null()); +} diff --git a/gix-index/src/fs.rs b/gix-index/src/fs.rs index 21422f9b804..17abe795cad 100644 --- a/gix-index/src/fs.rs +++ b/gix-index/src/fs.rs @@ -5,7 +5,7 @@ // it's allowed for good measure, in case there are systems that use different types for that. use std::path::Path; -use std::time::{Duration, SystemTime}; +use std::time::SystemTime; /// A structure to partially mirror [`std::fs::Metadata`]. #[cfg(not(windows))] @@ -161,6 +161,7 @@ impl Metadata { } } +#[cfg(not(windows))] fn system_time_from_secs_nanos(secs: u64, nanos: u32) -> SystemTime { - std::time::UNIX_EPOCH + Duration::new(secs, nanos) + std::time::UNIX_EPOCH + std::time::Duration::new(secs, nanos) } diff --git a/gix-object/src/find.rs b/gix-object/src/find.rs index 271079eb412..8471592cbc9 100644 --- a/gix-object/src/find.rs +++ b/gix-object/src/find.rs @@ -66,6 +66,12 @@ pub mod existing_iter { #[derive(Debug, Copy, Clone)] pub struct Never; +impl super::FindHeader for Never { + fn try_header(&self, _id: &gix_hash::oid) -> Result, Error> { + Ok(None) + } +} + impl super::Find for Never { fn try_find<'a>(&self, _id: &gix_hash::oid, _buffer: &'a mut Vec) -> Result>, Error> { Ok(None) diff --git a/gix-object/src/lib.rs b/gix-object/src/lib.rs index 661397ca36d..85ff8708e11 100644 --- a/gix-object/src/lib.rs +++ b/gix-object/src/lib.rs @@ -34,7 +34,7 @@ pub mod data; pub mod find; mod traits; -pub use traits::{Exists, Find, FindExt, WriteTo}; +pub use traits::{Exists, Find, FindExt, FindObjectOrHeader, Header as FindHeader, HeaderExt, WriteTo}; pub mod encode; pub(crate) mod parse; @@ -257,6 +257,15 @@ pub struct Data<'a> { pub data: &'a [u8], } +/// Information about an object, which includes its kind and the amount of bytes it would have when obtained. +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] +pub struct Header { + /// The kind of object. + pub kind: Kind, + /// The object's size in bytes, or the size of the buffer when it's retrieved in full. + pub size: u64, +} + /// pub mod decode { #[cfg(feature = "verbose-object-parsing-errors")] diff --git a/gix-object/src/traits.rs b/gix-object/src/traits.rs index d48bc195d1a..e12215bef4c 100644 --- a/gix-object/src/traits.rs +++ b/gix-object/src/traits.rs @@ -71,6 +71,17 @@ mod find { ) -> Result>, find::Error>; } + /// Find the header of an object in the object store. + pub trait Header { + /// Find the header of the object matching `id` in the database. + /// + /// Returns `Some` header if it was present, or the error that occurred during lookup. + fn try_header(&self, id: &gix_hash::oid) -> Result, find::Error>; + } + + /// A combination of [`Find`] and [`Header`] traits to help with `dyn` trait objects. + pub trait FindObjectOrHeader: Find + Header {} + mod _impls { use std::{ops::Deref, rc::Rc, sync::Arc}; @@ -86,6 +97,8 @@ mod find { } } + impl crate::FindObjectOrHeader for T where T: crate::Find + crate::FindHeader {} + impl crate::Find for &T where T: crate::Find, @@ -95,6 +108,15 @@ mod find { } } + impl crate::FindHeader for &T + where + T: crate::FindHeader, + { + fn try_header(&self, id: &gix_hash::oid) -> Result, crate::find::Error> { + (*self).try_header(id) + } + } + impl crate::Exists for Box where T: crate::Exists, @@ -122,6 +144,15 @@ mod find { } } + impl crate::FindHeader for Rc + where + T: crate::FindHeader, + { + fn try_header(&self, id: &gix_hash::oid) -> Result, crate::find::Error> { + self.deref().try_header(id) + } + } + impl crate::Find for Box where T: crate::Find, @@ -131,6 +162,15 @@ mod find { } } + impl crate::FindHeader for Box + where + T: crate::FindHeader, + { + fn try_header(&self, id: &gix_hash::oid) -> Result, crate::find::Error> { + self.deref().try_header(id) + } + } + impl crate::Exists for Arc where T: crate::Exists, @@ -148,6 +188,15 @@ mod find { self.deref().try_find(id, buffer) } } + + impl crate::FindHeader for Arc + where + T: crate::FindHeader, + { + fn try_header(&self, id: &gix_hash::oid) -> Result, crate::find::Error> { + self.deref().try_header(id) + } + } } mod ext { @@ -214,9 +263,19 @@ mod find { }; } + /// An extension trait with convenience functions. + pub trait HeaderExt: super::Header { + /// Like [`try_header(…)`](super::Header::try_header()), but flattens the `Result>` into a single `Result` making a non-existing header an error. + fn header(&self, id: &gix_hash::oid) -> Result { + self.try_header(id) + .map_err(find::existing::Error::Find)? + .ok_or_else(|| find::existing::Error::NotFound { oid: id.to_owned() }) + } + } + /// An extension trait with convenience functions. pub trait FindExt: super::Find { - /// Like [`try_find(…)`][super::Find::try_find()], but flattens the `Result>` into a single `Result` making a non-existing object an error. + /// Like [`try_find(…)`](super::Find::try_find()), but flattens the `Result>` into a single `Result` making a non-existing object an error. fn find<'a>( &self, id: &gix_hash::oid, @@ -238,6 +297,6 @@ mod find { impl FindExt for T {} } - pub use ext::FindExt; + pub use ext::{FindExt, HeaderExt}; } pub use find::*; diff --git a/gix-odb/src/cache.rs b/gix-odb/src/cache.rs index aaa99756301..cc737041e63 100644 --- a/gix-odb/src/cache.rs +++ b/gix-odb/src/cache.rs @@ -177,6 +177,15 @@ mod impls { } } + impl gix_object::FindHeader for Cache + where + S: gix_object::FindHeader, + { + fn try_header(&self, id: &oid) -> Result, gix_object::find::Error> { + self.inner.try_header(id) + } + } + impl gix_pack::Find for Cache where S: gix_pack::Find, diff --git a/gix-odb/src/store_impls/dynamic/find.rs b/gix-odb/src/store_impls/dynamic/find.rs index b57b5fe830e..99b58d7477f 100644 --- a/gix-odb/src/store_impls/dynamic/find.rs +++ b/gix-odb/src/store_impls/dynamic/find.rs @@ -78,7 +78,6 @@ pub use error::Error; use gix_features::zlib; use crate::store::types::PackId; -use gix_object::{Exists, Find}; impl super::Handle where @@ -499,7 +498,7 @@ where } } -impl Find for super::Handle +impl gix_object::Find for super::Handle where S: Deref + Clone, Self: gix_pack::Find, @@ -513,7 +512,25 @@ where } } -impl Exists for super::Handle +impl gix_object::FindHeader for super::Handle +where + S: Deref + Clone, +{ + fn try_header(&self, id: &gix_hash::oid) -> Result, gix_object::find::Error> { + let mut snapshot = self.snapshot.borrow_mut(); + let mut inflate = self.inflate.borrow_mut(); + self.try_header_inner(id, &mut inflate, &mut snapshot, None) + .map(|maybe_header| { + maybe_header.map(|hdr| gix_object::Header { + kind: hdr.kind(), + size: hdr.size(), + }) + }) + .map_err(|err| Box::new(err) as _) + } +} + +impl gix_object::Exists for super::Handle where S: Deref + Clone, Self: gix_pack::Find, diff --git a/gix-odb/src/store_impls/dynamic/header.rs b/gix-odb/src/store_impls/dynamic/header.rs index 42fd51c85ba..2a331a094c5 100644 --- a/gix-odb/src/store_impls/dynamic/header.rs +++ b/gix-odb/src/store_impls/dynamic/header.rs @@ -13,7 +13,7 @@ impl super::Handle where S: Deref + Clone, { - fn try_header_inner<'b>( + pub(crate) fn try_header_inner<'b>( &'b self, mut id: &'b gix_hash::oid, inflate: &mut zlib::Inflate, diff --git a/gix-status/src/stack.rs b/gix-status/src/stack.rs index 3f3cd8f9797..b97a1c4e139 100644 --- a/gix-status/src/stack.rs +++ b/gix-status/src/stack.rs @@ -34,6 +34,7 @@ impl gix_fs::stack::Delegate for Delegate { Ok(()) } + #[cfg_attr(windows, allow(unused_variables))] fn push(&mut self, is_last_component: bool, stack: &Stack) -> std::io::Result<()> { #[cfg(windows)] { diff --git a/gix-url/Cargo.toml b/gix-url/Cargo.toml index 0dd0861b023..a14947405ec 100644 --- a/gix-url/Cargo.toml +++ b/gix-url/Cargo.toml @@ -22,7 +22,7 @@ gix-path = { version = "^0.10.0", path = "../gix-path" } serde = { version = "1.0.114", optional = true, default-features = false, features = ["std", "derive"]} thiserror = "1.0.32" -url = "2.1.1" +url = "2.2.0" bstr = { version = "1.3.0", default-features = false, features = ["std"] } home = "0.5.3" diff --git a/gix-utils/src/buffers.rs b/gix-utils/src/buffers.rs new file mode 100644 index 00000000000..2c0544ea3da --- /dev/null +++ b/gix-utils/src/buffers.rs @@ -0,0 +1,54 @@ +use crate::Buffers; + +/// Lifecycle +impl Buffers { + /// Use this if there is an input buffer `src` which isn't owned by you, but which should be used as source when + /// asking for [`src_and_dest()`](WithForeignSource::src_and_dest()). + pub fn use_foreign_src<'a, 'src>(&'a mut self, src: &'src [u8]) -> WithForeignSource<'src, 'a> { + self.clear(); + WithForeignSource { + ro_src: Some(src), + src: &mut self.src, + dest: &mut self.dest, + } + } +} + +impl Buffers { + /// Clear all buffers, which should be called once processing is done. + pub fn clear(&mut self) { + self.src.clear(); + self.dest.clear(); + } + + /// Must be called after every change (i.e. when it's known that `dest` was written. + pub fn swap(&mut self) { + std::mem::swap(&mut self.src, &mut self.dest); + } +} + +/// A utility to do buffer-swapping with, similar to [`Buffers`], but with support for a +/// read-only one-time buffer as source. +pub struct WithForeignSource<'src, 'bufs> { + /// The original source buffer, or `None` if already altered. + pub ro_src: Option<&'src [u8]>, + /// The source buffer that will be used after the first call to `swap`. + pub src: &'bufs mut Vec, + dest: &'bufs mut Vec, +} + +impl<'bufs> WithForeignSource<'_, 'bufs> { + /// Must be called after every change (i.e. when it's known that `dest` was written. + pub fn swap(&mut self) { + self.ro_src.take(); + std::mem::swap(&mut self.src, &mut self.dest); + self.dest.clear(); + } + /// Obtain `(source, destination)`, which reads from the read-only source exactly once. + pub fn src_and_dest(&mut self) -> (&[u8], &mut Vec) { + match self.ro_src { + Some(src) => (src, &mut self.dest), + None => (self.src, &mut self.dest), + } + } +} diff --git a/gix-utils/src/lib.rs b/gix-utils/src/lib.rs index 4c9d99fa423..f5951b10451 100644 --- a/gix-utils/src/lib.rs +++ b/gix-utils/src/lib.rs @@ -6,3 +6,20 @@ /// pub mod backoff; + +/// +pub mod buffers; + +/// A utility to do buffer-swapping with. +/// +/// Use `src` to read from and `dest` to write to, and after actually changing data, call [Buffers::swap()]. +/// To be able to repeat the process, this time using what was `dest` as `src`, freeing up `dest` for writing once more. +/// +/// Note that after each [`Buffers::swap()`], `src` is the most recent version of the data, just like before each swap. +#[derive(Default, Clone)] +pub struct Buffers { + /// The source data, as basis for processing. + pub src: Vec, + /// The data produced after processing `src`. + pub dest: Vec, +} diff --git a/gix-utils/tests/buffers/mod.rs b/gix-utils/tests/buffers/mod.rs new file mode 100644 index 00000000000..7d7eb9f8dad --- /dev/null +++ b/gix-utils/tests/buffers/mod.rs @@ -0,0 +1,33 @@ +use gix_utils::Buffers; + +#[test] +fn lifecycle() { + let mut bufs = Buffers::default(); + let mut bufs = bufs.use_foreign_src(b"a"); + + assert_eq!(bufs.ro_src.unwrap(), b"a"); + + let (src, dest) = bufs.src_and_dest(); + + assert_eq!(src, b"a"); + assert_eq!(dest.len(), 0); + dest.push(b'b'); + + bufs.swap(); + + assert_eq!(bufs.src, b"b"); + assert!(bufs.ro_src.is_none()); + + let (src, dest) = bufs.src_and_dest(); + assert_eq!(src, b"b"); + assert_eq!( + dest.len(), + 0, + "the original previously empty source buffer was swapped in " + ); + dest.push(b'c'); + bufs.swap(); + let (src, dest) = bufs.src_and_dest(); + assert_eq!(src, b"c"); + assert_eq!(dest.len(), 0, "dest always starting out empty"); +} diff --git a/gix-utils/tests/utils.rs b/gix-utils/tests/utils.rs index ed38d3e98c6..8face990aab 100644 --- a/gix-utils/tests/utils.rs +++ b/gix-utils/tests/utils.rs @@ -1 +1,2 @@ mod backoff; +mod buffers; diff --git a/gix-worktree-stream/tests/stream.rs b/gix-worktree-stream/tests/stream.rs index 9d341ca15ad..b84859b7cd8 100644 --- a/gix-worktree-stream/tests/stream.rs +++ b/gix-worktree-stream/tests/stream.rs @@ -261,7 +261,6 @@ mod from_tree { fn mutating_pipeline(driver: bool) -> gix_filter::Pipeline { gix_filter::Pipeline::new( - &Default::default(), Default::default(), gix_filter::pipeline::Options { drivers: if driver { vec![driver_with_process()] } else { vec![] }, diff --git a/gix-worktree/src/lib.rs b/gix-worktree/src/lib.rs index 3e6fc19d503..26464956615 100644 --- a/gix-worktree/src/lib.rs +++ b/gix-worktree/src/lib.rs @@ -10,7 +10,8 @@ #![cfg_attr(all(doc, feature = "document-features"), feature(doc_cfg, doc_auto_cfg))] #![deny(missing_docs, rust_2018_idioms, unsafe_code)] use bstr::BString; -pub use gix_glob; +/// A way to access the [`Case`](glob::pattern::Case) enum which used throughout this API. +pub use gix_glob as glob; /// A cache for efficiently executing operations on directories and files which are encountered in sorted order. /// That way, these operations can be re-used for subsequent invocations in the same directory. diff --git a/gix-worktree/src/stack/mod.rs b/gix-worktree/src/stack/mod.rs index 387acf2acff..b6c03d175bf 100644 --- a/gix-worktree/src/stack/mod.rs +++ b/gix-worktree/src/stack/mod.rs @@ -108,22 +108,19 @@ impl Stack { /// `objects` maybe used to lookup objects from an [id mapping][crate::stack::State::id_mappings_from_index()], with mappnigs /// /// Provide access to cached information for that `relative` path via the returned platform. - pub fn at_path( + pub fn at_path( &mut self, relative: impl AsRef, is_dir: Option, - objects: Find, - ) -> std::io::Result> - where - Find: gix_object::Find, - { + objects: &dyn gix_object::Find, + ) -> std::io::Result> { self.statistics.platforms += 1; let mut delegate = StackDelegate { state: &mut self.state, buf: &mut self.buf, is_dir: is_dir.unwrap_or(false), id_mappings: &self.id_mappings, - objects: &objects, + objects, case: self.case, statistics: &mut self.statistics, }; @@ -142,15 +139,12 @@ impl Stack { /// ### Panics /// /// on illformed UTF8 in `relative` - pub fn at_entry<'r, Find>( + pub fn at_entry<'r>( &mut self, relative: impl Into<&'r BStr>, is_dir: Option, - objects: Find, - ) -> std::io::Result> - where - Find: gix_object::Find, - { + objects: &dyn gix_object::Find, + ) -> std::io::Result> { let relative = relative.into(); let relative_path = gix_path::from_bstr(relative); diff --git a/gix-worktree/tests/worktree/stack/attributes.rs b/gix-worktree/tests/worktree/stack/attributes.rs index d595a9e2860..e4fda10e905 100644 --- a/gix-worktree/tests/worktree/stack/attributes.rs +++ b/gix-worktree/tests/worktree/stack/attributes.rs @@ -29,7 +29,7 @@ fn baseline() -> crate::Result { let mut actual = cache.attribute_matches(); let input = std::fs::read(base.join("baseline"))?; for (rela_path, expected) in (baseline::Expectations { lines: input.lines() }) { - let entry = cache.at_entry(rela_path, None, gix_object::find::Never)?; + let entry = cache.at_entry(rela_path, None, &gix_object::find::Never)?; let has_match = entry.matching_attributes(&mut actual); assert_eq!( @@ -50,7 +50,7 @@ fn baseline() -> crate::Result { let mut actual = cache.selected_attribute_matches(["info", "test"]); let input = std::fs::read(base.join("baseline.selected"))?; for (rela_path, expected) in (baseline::Expectations { lines: input.lines() }) { - let entry = cache.at_entry(rela_path, None, gix_object::find::Never)?; + let entry = cache.at_entry(rela_path, None, &gix_object::find::Never)?; let has_match = entry.matching_attributes(&mut actual); assert_eq!( diff --git a/gix-worktree/tests/worktree/stack/create_directory.rs b/gix-worktree/tests/worktree/stack/create_directory.rs index 4ae15cf70f6..19130d6559f 100644 --- a/gix-worktree/tests/worktree/stack/create_directory.rs +++ b/gix-worktree/tests/worktree/stack/create_directory.rs @@ -15,7 +15,7 @@ fn root_is_assumed_to_exist_and_files_in_root_do_not_create_directory() -> crate ); assert_eq!(cache.statistics().delegate.num_mkdir_calls, 0); - let path = cache.at_path("hello", Some(false), gix_object::find::Never)?.path(); + let path = cache.at_path("hello", Some(false), &gix_object::find::Never)?.path(); assert!(!path.parent().unwrap().exists(), "prefix itself is never created"); assert_eq!(cache.statistics().delegate.num_mkdir_calls, 0); Ok(()) @@ -33,7 +33,7 @@ fn directory_paths_are_created_in_full() { ("link", None), ] { let path = cache - .at_path(Path::new("dir").join(name), *is_dir, gix_object::find::Never) + .at_path(Path::new("dir").join(name), *is_dir, &gix_object::find::Never) .unwrap() .path(); assert!(path.parent().unwrap().is_dir(), "dir exists"); @@ -47,7 +47,7 @@ fn existing_directories_are_fine() -> crate::Result { let (mut cache, tmp) = new_cache(); std::fs::create_dir(tmp.path().join("dir"))?; - let path = cache.at_path("dir/file", Some(false), gix_object::find::Never)?.path(); + let path = cache.at_path("dir/file", Some(false), &gix_object::find::Never)?.path(); assert!(path.parent().unwrap().is_dir(), "directory is still present"); assert!(!path.exists(), "it won't create the file"); assert_eq!(cache.statistics().delegate.num_mkdir_calls, 1); @@ -72,7 +72,7 @@ fn symlinks_or_files_in_path_are_forbidden_or_unlinked_when_forced() -> crate::R let relative_path = format!("{dirname}/file"); assert_eq!( cache - .at_path(&relative_path, Some(false), gix_object::find::Never) + .at_path(&relative_path, Some(false), &gix_object::find::Never) .unwrap_err() .kind(), std::io::ErrorKind::AlreadyExists @@ -93,7 +93,7 @@ fn symlinks_or_files_in_path_are_forbidden_or_unlinked_when_forced() -> crate::R } let relative_path = format!("{dirname}/file"); let path = cache - .at_path(&relative_path, Some(false), gix_object::find::Never)? + .at_path(&relative_path, Some(false), &gix_object::find::Never)? .path(); assert!(path.parent().unwrap().is_dir(), "directory was forcefully created"); assert!(!path.exists()); diff --git a/gix-worktree/tests/worktree/stack/ignore.rs b/gix-worktree/tests/worktree/stack/ignore.rs index 6dbcecb5cb8..7de236335a6 100644 --- a/gix-worktree/tests/worktree/stack/ignore.rs +++ b/gix-worktree/tests/worktree/stack/ignore.rs @@ -65,7 +65,7 @@ fn exclude_by_dir_is_handled_just_like_git() { let relative_path = gix_path::from_byte_slice(relative_entry); let is_dir = dir.join(relative_path).metadata().ok().map(|m| m.is_dir()); - let platform = cache.at_entry(relative_entry, is_dir, FindError).unwrap(); + let platform = cache.at_entry(relative_entry, is_dir, &FindError).unwrap(); let match_ = platform.matching_exclude_pattern().expect("match all values"); let _is_excluded = platform.is_excluded(); assert_eq!( diff --git a/gix/Cargo.toml b/gix/Cargo.toml index 5f566b71a9e..db3dfa225ad 100644 --- a/gix/Cargo.toml +++ b/gix/Cargo.toml @@ -95,7 +95,7 @@ revparse-regex = ["regex", "revision"] ## Make it possible to diff blobs line by line. Note that this feature is integral for implementing tree-diffs as well due to the handling of rename-tracking, ## which relies on line-by-line diffs in some cases. -blob-diff = ["gix-diff/blob"] +blob-diff = ["gix-diff/blob", "attributes"] ## Make it possible to turn a tree into a stream of bytes, which can be decoded to entries and turned into various other formats. worktree-stream = ["gix-worktree-stream", "attributes"] diff --git a/gix/src/config/cache/access.rs b/gix/src/config/cache/access.rs index 5031501d3f1..feeb241ba30 100644 --- a/gix/src/config/cache/access.rs +++ b/gix/src/config/cache/access.rs @@ -39,6 +39,97 @@ impl Cache { .copied() } + #[cfg(feature = "blob-diff")] + pub(crate) fn diff_drivers(&self) -> Result, config::diff::drivers::Error> { + use crate::config::cache::util::ApplyLeniencyDefault; + let mut out = Vec::::new(); + for section in self + .resolved + .sections_by_name("diff") + .into_iter() + .flatten() + .filter(|s| (self.filter_config_section)(s.meta())) + { + let Some(name) = section.header().subsection_name().filter(|n| !n.is_empty()) else { + continue; + }; + + let driver = match out.iter_mut().find(|d| d.name == name) { + Some(existing) => existing, + None => { + out.push(gix_diff::blob::Driver { + name: name.into(), + ..Default::default() + }); + out.last_mut().expect("just pushed") + } + }; + + if let Some(binary) = section.value_implicit("binary") { + driver.is_binary = config::tree::Diff::DRIVER_BINARY + .try_into_binary(binary) + .with_leniency(self.lenient_config) + .map_err(|err| config::diff::drivers::Error { + name: driver.name.clone(), + attribute: "binary", + source: Box::new(err), + })?; + } + if let Some(command) = section.value(config::tree::Diff::DRIVER_COMMAND.name) { + driver.command = command.into_owned().into(); + } + if let Some(textconv) = section.value(config::tree::Diff::DRIVER_TEXTCONV.name) { + driver.binary_to_text_command = textconv.into_owned().into(); + } + if let Some(algorithm) = section.value("algorithm") { + driver.algorithm = config::tree::Diff::DRIVER_ALGORITHM + .try_into_algorithm(algorithm) + .or_else(|err| match err { + config::diff::algorithm::Error::Unimplemented { .. } if self.lenient_config => { + Ok(gix_diff::blob::Algorithm::Histogram) + } + err => Err(err), + }) + .with_lenient_default(self.lenient_config) + .map_err(|err| config::diff::drivers::Error { + name: driver.name.clone(), + attribute: "algorithm", + source: Box::new(err), + })? + .into(); + } + } + Ok(out) + } + + #[cfg(feature = "blob-diff")] + pub(crate) fn diff_pipeline_options( + &self, + ) -> Result { + Ok(gix_diff::blob::pipeline::Options { + large_file_threshold_bytes: self.big_file_threshold()?, + fs: self.fs_capabilities()?, + }) + } + + #[cfg(feature = "blob-diff")] + pub(crate) fn diff_renames(&self) -> Result, crate::diff::new_rewrites::Error> { + self.diff_renames + .get_or_try_init(|| crate::diff::new_rewrites(&self.resolved, self.lenient_config)) + .copied() + } + + #[cfg(feature = "blob-diff")] + pub(crate) fn big_file_threshold(&self) -> Result { + Ok(self + .resolved + .integer_by_key("core.bigFileThreshold") + .map(|number| Core::BIG_FILE_THRESHOLD.try_into_u64(number)) + .transpose() + .with_leniency(self.lenient_config)? + .unwrap_or(512 * 1024 * 1024)) + } + /// Returns a user agent for use with servers. #[cfg(any(feature = "async-network-client", feature = "blocking-network-client"))] pub(crate) fn user_agent_tuple(&self) -> (&'static str, Option>) { @@ -92,13 +183,6 @@ impl Cache { }) } - #[cfg(feature = "blob-diff")] - pub(crate) fn diff_renames(&self) -> Result, crate::diff::new_rewrites::Error> { - self.diff_renames - .get_or_try_init(|| crate::diff::new_rewrites(&self.resolved, self.lenient_config)) - .copied() - } - /// Returns (file-timeout, pack-refs timeout) pub(crate) fn lock_timeout( &self, @@ -196,12 +280,8 @@ impl Cache { )?; let capabilities = self.fs_capabilities()?; let filters = { - let collection = Default::default(); - let mut filters = gix_filter::Pipeline::new( - &collection, - repo.command_context()?, - crate::filter::Pipeline::options(repo)?, - ); + let mut filters = + gix_filter::Pipeline::new(repo.command_context()?, crate::filter::Pipeline::options(repo)?); if let Ok(mut head) = repo.head() { let ctx = filters.driver_context_mut(); ctx.ref_name = head.referent_name().map(|name| name.as_bstr().to_owned()); diff --git a/gix/src/config/cache/init.rs b/gix/src/config/cache/init.rs index 73fb256092d..444ae89fb0e 100644 --- a/gix/src/config/cache/init.rs +++ b/gix/src/config/cache/init.rs @@ -539,6 +539,16 @@ fn apply_environment_overrides( (env(key), key.name) }], ), + #[cfg(feature = "blob-diff")] + ( + "diff", + None, + git_prefix, + &[{ + let key = &config::tree::Diff::EXTERNAL; + (env(key), key.name) + }], + ), ] { let mut section = env_override .new_section(section_name, subsection_name) diff --git a/gix/src/config/mod.rs b/gix/src/config/mod.rs index 33c97fac687..2db9e1e6bd3 100644 --- a/gix/src/config/mod.rs +++ b/gix/src/config/mod.rs @@ -122,6 +122,36 @@ pub mod diff { Unimplemented { name: BString }, } } + + /// + pub mod pipeline_options { + /// The error produced when obtaining options needed to fill in [gix_diff::blob::pipeline::Options]. + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error(transparent)] + FilesystemCapabilities(#[from] crate::config::boolean::Error), + #[error(transparent)] + BigFileThreshold(#[from] crate::config::unsigned_integer::Error), + } + } + + /// + pub mod drivers { + use crate::bstr::BString; + + /// The error produced when obtaining a list of [Drivers](gix_diff::blob::Driver). + #[derive(Debug, thiserror::Error)] + #[error("Failed to parse value of 'diff.{name}.{attribute}'")] + pub struct Error { + /// The name fo the driver. + pub name: BString, + /// The name of the attribute we tried to parse. + pub attribute: &'static str, + /// The actual error that occurred. + pub source: Box, + } + } } /// diff --git a/gix/src/config/tree/sections/core.rs b/gix/src/config/tree/sections/core.rs index 2ec5c279ea3..50df2a77d9c 100644 --- a/gix/src/config/tree/sections/core.rs +++ b/gix/src/config/tree/sections/core.rs @@ -8,6 +8,9 @@ impl Core { pub const ABBREV: Abbrev = Abbrev::new_with_validate("abbrev", &config::Tree::CORE, validate::Abbrev); /// The `core.bare` key. pub const BARE: keys::Boolean = keys::Boolean::new_boolean("bare", &config::Tree::CORE); + /// The `core.bigFileThreshold` key. + pub const BIG_FILE_THRESHOLD: keys::UnsignedInteger = + keys::UnsignedInteger::new_unsigned_integer("bigFileThreshold", &config::Tree::CORE); /// The `core.checkStat` key. pub const CHECK_STAT: CheckStat = CheckStat::new_with_validate("checkStat", &config::Tree::CORE, validate::CheckStat); @@ -95,6 +98,7 @@ impl Section for Core { &[ &Self::ABBREV, &Self::BARE, + &Self::BIG_FILE_THRESHOLD, &Self::CHECK_STAT, &Self::DELTA_BASE_CACHE_LIMIT, &Self::DISAMBIGUATE, diff --git a/gix/src/config/tree/sections/diff.rs b/gix/src/config/tree/sections/diff.rs index 7c467b8f523..c8bf2249326 100644 --- a/gix/src/config/tree/sections/diff.rs +++ b/gix/src/config/tree/sections/diff.rs @@ -1,3 +1,4 @@ +use crate::config::tree::SubSectionRequirement; use crate::{ config, config::tree::{keys, Diff, Key, Section}, @@ -17,6 +18,24 @@ impl Diff { ); /// The `diff.renames` key. pub const RENAMES: Renames = Renames::new_renames("renames", &config::Tree::DIFF); + + /// The `diff..command` key. + pub const DRIVER_COMMAND: keys::String = keys::String::new_string("command", &config::Tree::DIFF) + .with_subsection_requirement(Some(SubSectionRequirement::Parameter("driver"))); + /// The `diff..textconv` key. + pub const DRIVER_TEXTCONV: keys::String = keys::String::new_string("textconv", &config::Tree::DIFF) + .with_subsection_requirement(Some(SubSectionRequirement::Parameter("driver"))); + /// The `diff..algorithm` key. + pub const DRIVER_ALGORITHM: Algorithm = + Algorithm::new_with_validate("algorithm", &config::Tree::DIFF, validate::Algorithm) + .with_subsection_requirement(Some(SubSectionRequirement::Parameter("driver"))); + /// The `diff..binary` key. + pub const DRIVER_BINARY: Binary = Binary::new_with_validate("binary", &config::Tree::DIFF, validate::Binary) + .with_subsection_requirement(Some(SubSectionRequirement::Parameter("driver"))); + + /// The `diff.external` key. + pub const EXTERNAL: keys::Program = + keys::Program::new_program("external", &config::Tree::DIFF).with_environment_override("GIT_EXTERNAL_DIFF"); } impl Section for Diff { @@ -25,7 +44,16 @@ impl Section for Diff { } fn keys(&self) -> &[&dyn Key] { - &[&Self::ALGORITHM, &Self::RENAME_LIMIT, &Self::RENAMES] + &[ + &Self::ALGORITHM, + &Self::RENAME_LIMIT, + &Self::RENAMES, + &Self::DRIVER_COMMAND, + &Self::DRIVER_TEXTCONV, + &Self::DRIVER_ALGORITHM, + &Self::DRIVER_BINARY, + &Self::EXTERNAL, + ] } } @@ -35,6 +63,9 @@ pub type Algorithm = keys::Any; /// The `diff.renames` key. pub type Renames = keys::Any; +/// The `diff..binary` key. +pub type Binary = keys::Any; + mod algorithm { use std::borrow::Cow; @@ -67,6 +98,38 @@ mod algorithm { } } +mod binary { + use crate::config::tree::diff::Binary; + + impl Binary { + /// Convert `value` into a tri-state boolean that can take the special value `auto`, resulting in `None`, or is a boolean. + /// If `None` is given, it's treated as implicit boolean `true`, as this method is made to be used + /// with [`gix_config::file::section::Body::value_implicit()`]. + pub fn try_into_binary( + &'static self, + value: Option>, + ) -> Result, crate::config::key::GenericErrorWithValue> { + Ok(match value { + None => Some(true), + Some(value) => { + if value.as_ref() == "auto" { + None + } else { + Some( + gix_config::Boolean::try_from(value.as_ref()) + .map(|b| b.0) + .map_err(|err| { + crate::config::key::GenericErrorWithValue::from_value(self, value.into_owned()) + .with_source(err) + })?, + ) + } + } + }) + } + } +} + mod renames { use crate::{ bstr::ByteSlice, @@ -125,4 +188,12 @@ mod validate { Ok(()) } } + + pub struct Binary; + impl keys::Validate for Binary { + fn validate(&self, value: &BStr) -> Result<(), Box> { + Diff::DRIVER_BINARY.try_into_binary(Some(value.into()))?; + Ok(()) + } + } } diff --git a/gix/src/diff.rs b/gix/src/diff.rs index 445698cea39..315ef6cd791 100644 --- a/gix/src/diff.rs +++ b/gix/src/diff.rs @@ -22,6 +22,7 @@ mod utils { use crate::config::cache::util::ApplyLeniency; use crate::config::tree::Diff; use crate::diff::rename::Tracking; + use crate::Repository; use gix_diff::rewrites::Copies; use gix_diff::Rewrites; @@ -38,6 +39,27 @@ mod utils { } } + /// + pub mod resource_cache { + /// The error returned by [`resource_cache()`](super::resource_cache()). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error(transparent)] + DiffAlgorithm(#[from] crate::config::diff::algorithm::Error), + #[error(transparent)] + WorktreeFilterOptions(#[from] crate::filter::pipeline::options::Error), + #[error(transparent)] + DiffDrivers(#[from] crate::config::diff::drivers::Error), + #[error(transparent)] + DiffPipelineOptions(#[from] crate::config::diff::pipeline_options::Error), + #[error(transparent)] + CommandContext(#[from] crate::config::command_context::Error), + #[error(transparent)] + AttributeStack(#[from] crate::config::attribute_stack::Error), + } + } + /// Create an instance by reading all relevant information from the `config`uration, while being `lenient` or not. /// Returns `Ok(None)` if nothing is configured. /// @@ -75,6 +97,47 @@ mod utils { } .into()) } + + /// Return a low-level utility to efficiently prepare a the blob-level diff operation between two resources, + /// and cache these diffable versions so that matrix-like MxN diffs are efficient. + /// + /// `repo` is used to obtain the needed configuration values, and `index` is used to potentially read `.gitattributes` + /// files from which may affect the diff operation. + /// `mode` determines how the diffable files will look like, and also how fast, in average, these conversions are. + /// `attribute_source` controls where `.gitattributes` will be read from, and it's typically adjusted based on the + /// `roots` - if there are no worktree roots, `.gitattributes` are also not usually read from worktrees. + /// `roots` provide information about where to get diffable data from, so source and destination can either be sourced from + /// a worktree, or from the object database, or both. + pub fn resource_cache( + repo: &Repository, + index: &gix_index::State, + mode: gix_diff::blob::pipeline::Mode, + attribute_source: gix_worktree::stack::state::attributes::Source, + roots: gix_diff::blob::pipeline::WorktreeRoots, + ) -> Result { + let diff_algo = repo.config.diff_algorithm()?; + let diff_cache = gix_diff::blob::Platform::new( + gix_diff::blob::platform::Options { + algorithm: Some(diff_algo), + skip_internal_diff_if_external_is_configured: false, + }, + gix_diff::blob::Pipeline::new( + roots, + gix_filter::Pipeline::new(repo.command_context()?, crate::filter::Pipeline::options(repo)?), + repo.config.diff_drivers()?, + repo.config.diff_pipeline_options()?, + ), + mode, + repo.attributes_only( + // TODO(perf): this could benefit from not having to build an intermediate index, + // and traverse the a tree directly. + index, + attribute_source, + )? + .inner, + ); + Ok(diff_cache) + } } #[cfg(feature = "blob-diff")] -pub use utils::new_rewrites; +pub use utils::{new_rewrites, resource_cache}; diff --git a/gix/src/filter.rs b/gix/src/filter.rs index d18fc06fa57..f05e332a255 100644 --- a/gix/src/filter.rs +++ b/gix/src/filter.rs @@ -113,11 +113,7 @@ impl<'repo> Pipeline<'repo> { /// Create a new instance by extracting all necessary information and configuration from a `repo` along with `cache` for accessing /// attributes. The `index` is used for some filters which may access it under very specific circumstances. pub fn new(repo: &'repo Repository, cache: gix_worktree::Stack) -> Result { - let pipeline = gix_filter::Pipeline::new( - cache.attributes_collection(), - repo.command_context()?, - Self::options(repo)?, - ); + let pipeline = gix_filter::Pipeline::new(repo.command_context()?, Self::options(repo)?); Ok(Pipeline { inner: pipeline, cache, diff --git a/gix/src/object/blob.rs b/gix/src/object/blob.rs index ef19c302a62..56644e5b72e 100644 --- a/gix/src/object/blob.rs +++ b/gix/src/object/blob.rs @@ -3,63 +3,132 @@ use crate::{Blob, ObjectDetached}; /// #[cfg(feature = "blob-diff")] pub mod diff { + use gix_diff::blob::platform::prepare_diff::Operation; + use gix_diff::blob::ResourceKind; use std::ops::Range; - use crate::{bstr::ByteSlice, object::blob::diff::line::Change}; + use crate::object::tree::diff::change::Event; + use crate::{bstr::ByteSlice, object::blob::diff::lines::Change}; /// A platform to keep temporary information to perform line diffs on modified blobs. /// - pub struct Platform<'old, 'new> { - /// The previous version of the blob. - pub old: crate::Object<'old>, - /// The new version of the blob. - pub new: crate::Object<'new>, - /// The algorithm to use when calling [imara_diff::diff()][gix_diff::blob::diff()]. - /// This value is determined by the `diff.algorithm` configuration. - pub algo: gix_diff::blob::Algorithm, + pub struct Platform<'a> { + /// The cache holding diffable data related to our blobs. + pub resource_cache: &'a mut gix_diff::blob::Platform, } /// pub mod init { - /// The error returned by [`Platform::from_ids()`][super::Platform::from_ids()]. - #[derive(Debug, thiserror::Error)] - #[allow(missing_docs)] - pub enum Error { - #[error("Could not find the previous blob or the new blob to diff against")] - FindExisting(#[from] crate::object::find::existing::Error), - #[error("Could not obtain diff algorithm from configuration")] - DiffAlgorithm(#[from] crate::config::diff::algorithm::Error), - } + /// The error returned by [`Platform::from_tree_change()`][super::Platform::from_tree_change()]. + pub type Error = gix_diff::blob::platform::set_resource::Error; } - impl<'old, 'new> Platform<'old, 'new> { - /// Produce a platform for performing various diffs after obtaining the object data of `previous_id` and `new_id`. - /// - /// Note that these objects are treated as raw data and are assumed to be blobs. - pub fn from_ids( - previous_id: &crate::Id<'old>, - new_id: &crate::Id<'new>, - ) -> Result, init::Error> { - match previous_id - .object() - .and_then(|old| new_id.object().map(|new| (old, new))) - { - Ok((old, new)) => { - let algo = match new_id.repo.config.diff_algorithm() { - Ok(algo) => algo, - Err(err) => return Err(err.into()), - }; - Ok(Platform { old, new, algo }) + impl<'a> Platform<'a> { + /// Produce a platform for performing various diffs after obtaining the data from a single `tree_change`. + pub fn from_tree_change( + tree_change: &crate::object::tree::diff::Change<'_, '_, '_>, + resource_cache: &'a mut gix_diff::blob::Platform, + ) -> Result, init::Error> { + match tree_change.event { + Event::Addition { entry_mode, id } => { + resource_cache.set_resource( + id.repo.object_hash().null(), + entry_mode.kind(), + tree_change.location, + ResourceKind::OldOrSource, + &id.repo.objects, + )?; + resource_cache.set_resource( + id.inner, + entry_mode.kind(), + tree_change.location, + ResourceKind::NewOrDestination, + &id.repo.objects, + )?; + } + Event::Deletion { entry_mode, id } => { + resource_cache.set_resource( + id.inner, + entry_mode.kind(), + tree_change.location, + ResourceKind::OldOrSource, + &id.repo.objects, + )?; + resource_cache.set_resource( + id.repo.object_hash().null(), + entry_mode.kind(), + tree_change.location, + ResourceKind::NewOrDestination, + &id.repo.objects, + )?; + } + Event::Modification { + previous_entry_mode, + previous_id, + entry_mode, + id, + } => { + resource_cache.set_resource( + previous_id.inner, + previous_entry_mode.kind(), + tree_change.location, + ResourceKind::OldOrSource, + &previous_id.repo.objects, + )?; + resource_cache.set_resource( + id.inner, + entry_mode.kind(), + tree_change.location, + ResourceKind::NewOrDestination, + &id.repo.objects, + )?; + } + Event::Rewrite { + source_location, + source_entry_mode, + source_id, + entry_mode, + id, + diff: _, + copy: _, + } => { + resource_cache.set_resource( + source_id.inner, + source_entry_mode.kind(), + source_location, + ResourceKind::OldOrSource, + &source_id.repo.objects, + )?; + resource_cache.set_resource( + id.inner, + entry_mode.kind(), + tree_change.location, + ResourceKind::NewOrDestination, + &id.repo.objects, + )?; } - Err(err) => Err(err.into()), } + Ok(Self { resource_cache }) } } /// - pub mod line { + pub mod lines { use crate::bstr::BStr; + /// The error returned by [Platform::lines()](super::Platform::lines()). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error + where + E: std::error::Error + Send + Sync + 'static, + { + #[error(transparent)] + ProcessHunk(E), + #[error(transparent)] + PrepareDiff(#[from] gix_diff::blob::platform::prepare_diff::Error), + } + /// A change to a hunk of lines. pub enum Change<'a, 'data> { /// Lines were added. @@ -82,70 +151,91 @@ pub mod diff { } } - impl<'old, 'new> Platform<'old, 'new> { + impl<'a> Platform<'a> { /// Perform a diff on lines between the old and the new version of a blob, passing each hunk of lines to `process_hunk`. - /// The diffing algorithm is determined by the `diff.algorithm` configuration. - /// - /// Note that you can invoke the diff more flexibly as well. + /// The diffing algorithm is determined by the `diff.algorithm` configuration, or individual diff drivers. + /// Note that `process_hunk` is not called if one of the involved resources are binary, but that can be determined + /// by introspecting the outcome. // TODO: more tests (only tested insertion right now) - pub fn lines(&self, mut process_hunk: FnH) -> Result<(), E> + pub fn lines( + &mut self, + mut process_hunk: FnH, + ) -> Result, lines::Error> where - FnH: FnMut(line::Change<'_, '_>) -> Result<(), E>, - E: std::error::Error, + FnH: FnMut(lines::Change<'_, '_>) -> Result<(), E>, + E: std::error::Error + Send + Sync + 'static, { - let input = self.line_tokens(); - let mut err = None; - let mut lines = Vec::new(); - gix_diff::blob::diff(self.algo, &input, |before: Range, after: Range| { - if err.is_some() { - return; + self.resource_cache.options.skip_internal_diff_if_external_is_configured = false; + + let prep = self.resource_cache.prepare_diff()?; + match prep.operation { + Operation::InternalDiff { algorithm } => { + let input = prep.interned_input(); + let mut err = None; + let mut lines = Vec::new(); + + gix_diff::blob::diff(algorithm, &input, |before: Range, after: Range| { + if err.is_some() { + return; + } + lines.clear(); + lines.extend( + input.before[before.start as usize..before.end as usize] + .iter() + .map(|&line| input.interner[line].as_bstr()), + ); + let end_of_before = lines.len(); + lines.extend( + input.after[after.start as usize..after.end as usize] + .iter() + .map(|&line| input.interner[line].as_bstr()), + ); + let hunk_before = &lines[..end_of_before]; + let hunk_after = &lines[end_of_before..]; + if hunk_after.is_empty() { + err = process_hunk(Change::Deletion { lines: hunk_before }).err(); + } else if hunk_before.is_empty() { + err = process_hunk(Change::Addition { lines: hunk_after }).err(); + } else { + err = process_hunk(Change::Modification { + lines_before: hunk_before, + lines_after: hunk_after, + }) + .err(); + } + }); + + if let Some(err) = err { + return Err(lines::Error::ProcessHunk(err)); + } } - lines.clear(); - lines.extend( - input.before[before.start as usize..before.end as usize] - .iter() - .map(|&line| input.interner[line].as_bstr()), - ); - let end_of_before = lines.len(); - lines.extend( - input.after[after.start as usize..after.end as usize] - .iter() - .map(|&line| input.interner[line].as_bstr()), - ); - let hunk_before = &lines[..end_of_before]; - let hunk_after = &lines[end_of_before..]; - if hunk_after.is_empty() { - err = process_hunk(Change::Deletion { lines: hunk_before }).err(); - } else if hunk_before.is_empty() { - err = process_hunk(Change::Addition { lines: hunk_after }).err(); - } else { - err = process_hunk(Change::Modification { - lines_before: hunk_before, - lines_after: hunk_after, - }) - .err(); + Operation::ExternalCommand { .. } => { + unreachable!("we disabled that") } - }); - - match err { - Some(err) => Err(err), - None => Ok(()), - } + Operation::SourceOrDestinationIsBinary => {} + }; + Ok(prep) } /// Count the amount of removed and inserted lines efficiently. - pub fn line_counts(&self) -> gix_diff::blob::sink::Counter<()> { - let tokens = self.line_tokens(); - gix_diff::blob::diff(self.algo, &tokens, gix_diff::blob::sink::Counter::default()) - } + /// Note that nothing will happen if one of the inputs is binary, and `None` will be returned. + pub fn line_counts( + &mut self, + ) -> Result>, gix_diff::blob::platform::prepare_diff::Error> { + self.resource_cache.options.skip_internal_diff_if_external_is_configured = false; - /// Return a tokenizer which treats lines as smallest unit for use in a [diff operation][gix_diff::blob::diff()]. - /// - /// The line separator is determined according to normal git rules and filters. - pub fn line_tokens(&self) -> gix_diff::blob::intern::InternedInput<&[u8]> { - // TODO: make use of `core.eol` and/or filters to do line-counting correctly. It's probably - // OK to just know how these objects are saved to know what constitutes a line. - gix_diff::blob::intern::InternedInput::new(self.old.data.as_bytes(), self.new.data.as_bytes()) + let prep = self.resource_cache.prepare_diff()?; + match prep.operation { + Operation::InternalDiff { algorithm } => { + let tokens = prep.interned_input(); + let counter = gix_diff::blob::diff(algorithm, &tokens, gix_diff::blob::sink::Counter::default()); + Ok(Some(counter)) + } + Operation::ExternalCommand { .. } => { + unreachable!("we disabled that") + } + Operation::SourceOrDestinationIsBinary => Ok(None), + } } } } diff --git a/gix/src/object/tree/diff/change.rs b/gix/src/object/tree/diff/change.rs index a95770d6656..4e2bd2eff65 100644 --- a/gix/src/object/tree/diff/change.rs +++ b/gix/src/object/tree/diff/change.rs @@ -68,25 +68,25 @@ pub enum Event<'a, 'old, 'new> { }, } -impl<'a, 'old, 'new> Event<'a, 'old, 'new> { - /// Produce a platform for performing a line-diff, or `None` if this is not a [`Modification`][Event::Modification] - /// or one of the entries to compare is not a blob. - pub fn diff( +impl<'a, 'old, 'new> super::Change<'a, 'old, 'new> { + /// Produce a platform for performing a line-diff no matter whether the underlying [Event] is an addition, modification, + /// deletion or rewrite. + /// Use `resource_cache` to store the diffable data and possibly reuse previously stored data. + /// Afterwards the platform, which holds on to `resource_cache`, can be used to perform ready-made operations on the + /// pre-set resources. + /// + /// ### Warning about Memory Consumption + /// + /// `resource_cache` only grows, so one should call [`gix_diff::blob::Platform::clear_resource_cache`] occasionally. + pub fn diff<'b>( &self, - ) -> Option, crate::object::blob::diff::init::Error>> { - match self { - Event::Modification { - previous_entry_mode, - previous_id, - entry_mode, - id, - } if entry_mode.is_blob() && previous_entry_mode.is_blob() => { - Some(crate::object::blob::diff::Platform::from_ids(previous_id, id)) - } - _ => None, - } + resource_cache: &'b mut gix_diff::blob::Platform, + ) -> Result, crate::object::blob::diff::init::Error> { + crate::object::blob::diff::Platform::from_tree_change(self, resource_cache) } +} +impl<'a, 'old, 'new> Event<'a, 'old, 'new> { /// Return the current mode of this instance. pub fn entry_mode(&self) -> gix_object::tree::EntryMode { match self { diff --git a/gix/src/object/tree/diff/for_each.rs b/gix/src/object/tree/diff/for_each.rs index 404e804327d..86cddb528b6 100644 --- a/gix/src/object/tree/diff/for_each.rs +++ b/gix/src/object/tree/diff/for_each.rs @@ -12,8 +12,8 @@ pub enum Error { Diff(#[from] gix_diff::tree::changes::Error), #[error("The user-provided callback failed")] ForEach(#[source] Box), - #[error("Could not configure diff algorithm prior to checking similarity")] - ConfigureDiffAlgorithm(#[from] crate::config::diff::algorithm::Error), + #[error(transparent)] + ResourceCache(#[from] crate::repository::diff::resource_cache::Error), #[error("Failure during rename tracking")] RenameTracking(#[from] tracker::emit::Error), } @@ -36,18 +36,51 @@ impl<'a, 'old> Platform<'a, 'old> { other: &Tree<'new>, for_each: impl FnMut(Change<'_, 'old, 'new>) -> Result, ) -> Result + where + E: std::error::Error + Sync + Send + 'static, + { + self.for_each_to_obtain_tree_inner(other, for_each, None) + } + + /// Like [`Self::for_each_to_obtain_tree()`], but with a reusable `resource_cache` which is used to perform + /// diffs fast. + /// + /// Reusing it between multiple invocations saves a lot of IOps as it avoids the creation + /// of a temporary `resource_cache` that triggers reading or checking for multiple gitattribute files. + /// Note that it's recommended to call [`gix_diff::blob::Platform::clear_resource_cache()`] between the calls + /// to avoid runaway memory usage, as the cache isn't limited. + /// + /// Note that to do rename tracking like `git` does, one has to configure the `resource_cache` with + /// a conversion pipeline that uses [`gix_diff::blob::pipeline::Mode::ToGit`]. + pub fn for_each_to_obtain_tree_with_cache<'new, E>( + &mut self, + other: &Tree<'new>, + resource_cache: &mut gix_diff::blob::Platform, + for_each: impl FnMut(Change<'_, 'old, 'new>) -> Result, + ) -> Result + where + E: std::error::Error + Sync + Send + 'static, + { + self.for_each_to_obtain_tree_inner(other, for_each, Some(resource_cache)) + } + + fn for_each_to_obtain_tree_inner<'new, E>( + &mut self, + other: &Tree<'new>, + for_each: impl FnMut(Change<'_, 'old, 'new>) -> Result, + resource_cache: Option<&mut gix_diff::blob::Platform>, + ) -> Result where E: std::error::Error + Sync + Send + 'static, { let repo = self.lhs.repo; - let diff_algo = repo.config.diff_algorithm()?; let mut delegate = Delegate { src_tree: self.lhs, other_repo: other.repo, recorder: gix_diff::tree::Recorder::default().track_location(self.tracking), visit: for_each, location: self.tracking, - tracked: self.rewrites.map(|r| rewrites::Tracker::new(r, diff_algo)), + tracked: self.rewrites.map(rewrites::Tracker::new), err: None, }; match gix_diff::tree::Changes::from(TreeRefIter::from_bytes(&self.lhs.data)).needed_to_obtain( @@ -58,7 +91,7 @@ impl<'a, 'old> Platform<'a, 'old> { ) { Ok(()) => { let outcome = Outcome { - rewrites: delegate.process_tracked_changes()?, + rewrites: delegate.process_tracked_changes(resource_cache)?, }; match delegate.err { Some(err) => Err(Error::ForEach(Box::new(err))), @@ -131,12 +164,25 @@ where } } - fn process_tracked_changes(&mut self) -> Result, Error> { + fn process_tracked_changes( + &mut self, + diff_cache: Option<&mut gix_diff::blob::Platform>, + ) -> Result, Error> { let tracked = match self.tracked.as_mut() { Some(t) => t, None => return Ok(None), }; + let repo = self.src_tree.repo; + let mut storage; + let diff_cache = match diff_cache { + Some(cache) => cache, + None => { + storage = repo.diff_resource_cache(gix_diff::blob::pipeline::Mode::ToGit, Default::default())?; + &mut storage + } + }; + let outcome = tracked.emit( |dest, source| match source { Some(source) => { @@ -174,6 +220,7 @@ where &mut self.err, ), }, + diff_cache, &self.src_tree.repo.objects, |push| { self.src_tree diff --git a/gix/src/repository/diff.rs b/gix/src/repository/diff.rs new file mode 100644 index 00000000000..cb1d070a2f4 --- /dev/null +++ b/gix/src/repository/diff.rs @@ -0,0 +1,45 @@ +use crate::Repository; + +/// +pub mod resource_cache { + /// The error returned by [Repository::diff_resource_cache()](super::Repository::diff_resource_cache()). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error("Could not obtain resource cache for diffing")] + ResourceCache(#[from] crate::diff::resource_cache::Error), + #[error(transparent)] + Index(#[from] crate::repository::index_or_load_from_head::Error), + } +} + +/// Diff-utilities +impl Repository { + /// Create a resource cache for diffable objects, and configured with everything it needs to know to perform diffs + /// faithfully just like `git` would. + /// `mode` controls what version of a resource should be diffed. + /// `worktree_roots` determine if files can be read from the worktree, where each side of the diff operation can + /// be represented by its own worktree root. `.gitattributes` are automatically read from the worktree if at least + /// one worktree is present. + /// + /// Note that attributes will always be obtained from the current `HEAD` index even if the resources being diffed + /// might live in another tree. Further, if one of the `worktree_roots` are set, attributes will also be read from + /// the worktree. Otherwise, it will be skipped and attributes are read from the index tree instead. + pub fn diff_resource_cache( + &self, + mode: gix_diff::blob::pipeline::Mode, + worktree_roots: gix_diff::blob::pipeline::WorktreeRoots, + ) -> Result { + Ok(crate::diff::resource_cache( + self, + &*self.index_or_load_from_head()?, + mode, + if worktree_roots.new_root.is_some() || worktree_roots.old_root.is_some() { + gix_worktree::stack::state::attributes::Source::WorktreeThenIdMapping + } else { + gix_worktree::stack::state::attributes::Source::IdMapping + }, + worktree_roots, + )?) + } +} diff --git a/gix/src/repository/filter.rs b/gix/src/repository/filter.rs index 3aacb1a3d2c..68644ca984e 100644 --- a/gix/src/repository/filter.rs +++ b/gix/src/repository/filter.rs @@ -2,7 +2,7 @@ use crate::{filter, repository::IndexPersistedOrInMemory, Id, Repository}; /// pub mod pipeline { - /// The error returned by [Repository::filter_pipeline()][super::Repository::filter_pipeline()]. + /// The error returned by [Repository::filter_pipeline()](super::Repository::filter_pipeline()). #[derive(Debug, thiserror::Error)] #[allow(missing_docs)] pub enum Error { @@ -24,7 +24,7 @@ pub mod pipeline { impl Repository { /// Configure a pipeline for converting byte buffers to the worktree representation, and byte streams to the git-internal /// representation. Also return the index that was used when initializing the pipeline as it may be useful when calling - /// [convert_to_git()][filter::Pipeline::convert_to_git()]. + /// [convert_to_git()](filter::Pipeline::convert_to_git()). /// Bare repositories will either use `HEAD^{tree}` for accessing all relevant worktree files or the given `tree_if_bare`. /// /// Note that this is considered a primitive as it operates on data directly and will not have permanent effects. diff --git a/gix/src/repository/mod.rs b/gix/src/repository/mod.rs index 6b9e284cc05..28aa5aa8e28 100644 --- a/gix/src/repository/mod.rs +++ b/gix/src/repository/mod.rs @@ -40,6 +40,9 @@ pub mod attributes; mod cache; mod config; /// +#[cfg(feature = "blob-diff")] +pub mod diff; +/// #[cfg(feature = "attributes")] pub mod filter; mod graph; diff --git a/gix/src/repository/worktree.rs b/gix/src/repository/worktree.rs index 198c1eb5e6e..529243896ea 100644 --- a/gix/src/repository/worktree.rs +++ b/gix/src/repository/worktree.rs @@ -79,11 +79,7 @@ impl crate::Repository { let mut cache = self .attributes_only(&index, gix_worktree::stack::state::attributes::Source::IdMapping)? .detach(); - let pipeline = gix_filter::Pipeline::new( - cache.attributes_collection(), - self.command_context()?, - crate::filter::Pipeline::options(self)?, - ); + let pipeline = gix_filter::Pipeline::new(self.command_context()?, crate::filter::Pipeline::options(self)?); let objects = self.objects.clone().into_arc().expect("TBD error handling"); let stream = gix_worktree_stream::from_tree( id, diff --git a/gix/tests/config/tree.rs b/gix/tests/config/tree.rs index fd844f8b0f0..9ddb8d0470e 100644 --- a/gix/tests/config/tree.rs +++ b/gix/tests/config/tree.rs @@ -243,6 +243,37 @@ mod diff { Ok(()) } + #[test] + fn driver_binary() -> crate::Result { + assert_eq!( + Diff::DRIVER_BINARY.try_into_binary(Some(bcow("auto")))?, + None, + "this is as good as not setting it, but it's a valid value that would fail if it was just a boolean. It's undocumented though…" + ); + assert!(Diff::DRIVER_BINARY.validate("auto".into()).is_ok()); + + for (actual, expected) in [ + (Some("true"), Some(true)), + (Some("false"), Some(false)), + (None, Some(true)), + ] { + assert_eq!(Diff::DRIVER_BINARY.try_into_binary(actual.map(bcow))?, expected); + if let Some(value) = actual { + assert!(Diff::DRIVER_BINARY.validate(value.into()).is_ok()); + } + } + + assert_eq!( + Diff::DRIVER_BINARY + .try_into_binary(Some(bcow("something"))) + .unwrap_err() + .to_string(), + "The key \"diff..binary=something\" was invalid", + ); + assert!(Diff::DRIVER_BINARY.validate("foo".into()).is_err()); + Ok(()) + } + #[test] fn algorithm() -> crate::Result { for (actual, expected) in [ diff --git a/gix/tests/diff/mod.rs b/gix/tests/diff/mod.rs new file mode 100644 index 00000000000..d6087a50209 --- /dev/null +++ b/gix/tests/diff/mod.rs @@ -0,0 +1,47 @@ +use crate::util::named_repo; +use gix_diff::blob::{Algorithm, Driver}; + +#[test] +fn resource_cache() -> crate::Result { + let repo = named_repo("make_diff_repo.sh")?; + let cache = gix::diff::resource_cache( + &repo, + &*repo.index()?, + gix::diff::blob::pipeline::Mode::ToWorktreeAndBinaryToText, + gix_worktree::stack::state::attributes::Source::IdMapping, + Default::default(), + )?; + assert_eq!( + cache.filter.drivers(), + &[ + Driver { + name: "all-but-binary".into(), + command: Some("command".into()), + algorithm: Some(Algorithm::Histogram), + binary_to_text_command: Some("textconv".into()), + is_binary: None + }, + Driver { + name: "binary-false".into(), + is_binary: Some(false), + ..Default::default() + }, + Driver { + name: "binary-true".into(), + is_binary: Some(true), + ..Default::default() + } + ] + ); + assert_eq!(cache.options.algorithm, Some(Algorithm::Histogram)); + assert!( + !cache.options.skip_internal_diff_if_external_is_configured, + "pre-set to something that makes sense for most" + ); + assert_eq!( + cache.filter.options.large_file_threshold_bytes, + 512 * 1024 * 1024, + "the default value unless it's configured" + ); + Ok(()) +} diff --git a/gix/tests/fixtures/generated-archives/make_diff_repo.tar.xz b/gix/tests/fixtures/generated-archives/make_diff_repo.tar.xz index 0d6b76452b7..d4b13341e89 100644 --- a/gix/tests/fixtures/generated-archives/make_diff_repo.tar.xz +++ b/gix/tests/fixtures/generated-archives/make_diff_repo.tar.xz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dceeedc3ee706632030f8317f2245c327456ec58a6ac690d09099557a5aa23b0 -size 32336 +oid sha256:66f54895c75e2eebaad514224242522c19e2b16b12a48f22a059b438745e0327 +size 32444 diff --git a/gix/tests/fixtures/make_diff_repo.sh b/gix/tests/fixtures/make_diff_repo.sh index 1aac40b43a7..1175cf4ccd9 100755 --- a/gix/tests/fixtures/make_diff_repo.sh +++ b/gix/tests/fixtures/make_diff_repo.sh @@ -3,6 +3,24 @@ set -eu -o pipefail git init -q +cat <>.git/config + +[diff "binary-true"] + binary = true +[diff "binary-false"] + binary = false +[diff ""] + command = "empty is ignored" +[diff] + command = "this is also ignored as sub-section name is missing" + algorithm = histogram +[diff "all-but-binary"] + command = command + textconv = textconv + algorithm = histogram + binary = auto +EOF + git checkout -b main mkdir dir touch a b dir/c d diff --git a/gix/tests/gix-init.rs b/gix/tests/gix-init.rs index b6a51c5e1f7..917c62bbac1 100644 --- a/gix/tests/gix-init.rs +++ b/gix/tests/gix-init.rs @@ -51,7 +51,8 @@ mod with_overrides { .set("GIT_ICASE_PATHSPECS", "pathspecs-icase") .set("GIT_TERMINAL_PROMPT", "42") .set("GIT_SHALLOW_FILE", "shallow-file-env") - .set("GIT_NAMESPACE", "namespace-env"); + .set("GIT_NAMESPACE", "namespace-env") + .set("GIT_EXTERNAL_DIFF", "external-diff-env"); let mut opts = gix::open::Options::isolated() .cli_overrides([ "http.userAgent=agent-from-cli", @@ -236,6 +237,8 @@ mod with_overrides { ("gitoxide.http.verbose", "true"), ("gitoxide.allow.protocolFromUser", "file-allowed"), ("core.useReplaceRefs", "no-replace"), + #[cfg(feature = "blob-diff")] + ("diff.external", "external-diff-env"), ("gitoxide.objects.replaceRefBase", "refs/replace-mine"), ("gitoxide.committer.nameFallback", "committer name"), ("gitoxide.committer.emailFallback", "committer email"), diff --git a/gix/tests/gix.rs b/gix/tests/gix.rs index 4721e0d4eb2..81008faca89 100644 --- a/gix/tests/gix.rs +++ b/gix/tests/gix.rs @@ -4,6 +4,8 @@ use util::*; mod clone; mod commit; mod config; +#[cfg(feature = "blob-diff")] +mod diff; mod head; mod id; mod init; diff --git a/gix/tests/object/tree/diff.rs b/gix/tests/object/tree/diff.rs index c3304965b3f..867b6949ddf 100644 --- a/gix/tests/object/tree/diff.rs +++ b/gix/tests/object/tree/diff.rs @@ -2,7 +2,7 @@ use std::convert::Infallible; use gix::{ bstr::BString, - object::{blob::diff::line::Change, tree::diff::change::Event}, + object::{blob::diff::lines::Change, tree::diff::change::Event}, }; use gix_object::bstr::ByteSlice; use gix_object::tree::EntryKind; @@ -14,6 +14,7 @@ fn changes_against_tree_modified() -> crate::Result { let repo = named_repo("make_diff_repo.sh")?; let from = tree_named(&repo, "@^{/c3-modification}~1"); let to = tree_named(&repo, ":/c3-modification"); + let mut cache = repo.diff_resource_cache(gix_diff::blob::pipeline::Mode::ToGit, Default::default())?; from.changes()? .for_each_to_obtain_tree(&to, |change| -> Result<_, Infallible> { assert_eq!(change.location, "", "without configuration the location field is empty"); @@ -34,14 +35,14 @@ fn changes_against_tree_modified() -> crate::Result { } }; - let diff = change.event.diff().expect("changed file").expect("objects available"); - let count = diff.line_counts(); + let mut diff = change.diff(&mut cache).expect("objects available"); + let count = diff.line_counts().expect("no diff error").expect("no binary blobs"); assert_eq!(count.insertions, 1); assert_eq!(count.removals, 0); diff.lines(|hunk| { match hunk { Change::Deletion { .. } => unreachable!("there was no deletion"), - Change::Addition { lines } => assert_eq!(lines, vec!["a1".as_bytes().as_bstr()]), + Change::Addition { lines } => assert_eq!(lines, vec!["a1\n".as_bytes().as_bstr()]), Change::Modification { .. } => unreachable!("there was no modification"), }; Ok::<_, Infallible>(()) @@ -420,6 +421,7 @@ mod track_rewrites { insertions: 1, before: 11, after: 12, + similarity: 0.8888889 }), "by similarity there is a diff" ); @@ -529,6 +531,7 @@ mod track_rewrites { insertions: 3, before: 12, after: 15, + similarity: 0.75 }), "by similarity there is a diff" ); @@ -589,12 +592,12 @@ mod track_rewrites { let out = out.rewrites.expect("tracking enabled"); assert_eq!(stat, None, "similarity can't run"); - assert_eq!(out.num_similarity_checks, 3); + assert_eq!(out.num_similarity_checks, 0); assert_eq!( out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit, 0, "no limit configured" ); - assert_eq!(out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit, 57); + assert_eq!(out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit, 19); Ok(()) } @@ -645,7 +648,7 @@ mod track_rewrites { let out = out.rewrites.expect("tracking enabled"); assert_eq!(out.num_similarity_checks, 0); assert_eq!(out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit, 0); - assert_eq!(out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit, 3); + assert_eq!(out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit, 2); Ok(()) } diff --git a/src/plumbing/progress.rs b/src/plumbing/progress.rs index 5a9f033ec55..7059da02060 100644 --- a/src/plumbing/progress.rs +++ b/src/plumbing/progress.rs @@ -124,10 +124,6 @@ static GIT_CONFIG: &[Record] = &[ config: "core.alternateRefsPrefixes", usage: NotPlanned { reason: "seems like a niche feature, but can be implemented if there is demand" } }, - Record { - config: "core.bigFileThreshold", - usage: Planned { note: Some("unfortunately we can't stream packed files yet, even if not delta-compressed, but respecting the threshold for other operations is definitely a must") } - }, Record { config: "core.compression", usage: Planned { note: Some("Allow to remove similar hardcoded value - passing it through will be some effort") }, @@ -500,17 +496,9 @@ static GIT_CONFIG: &[Record] = &[ config: "transfer.credentialsInUrl", usage: Planned { note: Some("currently we are likely to expose passwords in errors or in other places, and it's better to by default not do that") } }, - Record { - config: "diff.*.textconv", - usage: Planned { note: None } - }, Record { config: "diff.*.cachetextconv", - usage: Planned { note: None } - }, - Record { - config: "diff.*.command", - usage: Planned { note: None } + usage: NotPlanned {reason: "It seems to slow to do that, and persisting results to save a relatively cheap computation doesn't seem right"} }, ];