Skip to content

Commit

Permalink
feat!: object::blob::diff::Platform now performs all necessary conv…
Browse files Browse the repository at this point in the history
…ersions.

Previously it would just offer the git-ODB version of a blob for diffing,
while it will now make it possible to apply all necessary conversion steps
for you.

This also moves `Event::diff()` to `Change::diff()`, adds
`Repository::diff_resource_cache()` and refactors nearly everything
about the `object::blob::diff::Platform`.
  • Loading branch information
Byron committed Dec 2, 2023
1 parent 406acef commit 4743212
Show file tree
Hide file tree
Showing 9 changed files with 260 additions and 129 deletions.
7 changes: 4 additions & 3 deletions gix/src/diff.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,15 @@ mod utils {
/// `repo` is used to obtain the needed configuration values, and `index` is used to potentially read `.gitattributes`
/// files from, which may affect the diff operation.
/// `mode` determines what the diffable files will look like, and also how fast, on average, these conversions are.
/// `attribute_source` controls where `.gitattributes` will be read from, and it's typically adjusted based on the
/// `roots` - if there are no worktree roots, `.gitattributes` are also not usually read from worktrees.
/// `roots` provide information about where to get diffable data from, so source and destination can either be sourced from
/// a worktree, or from the object database, or both.
pub fn resource_cache(
repo: &Repository,
index: &gix_index::State,
mode: gix_diff::blob::pipeline::Mode,
attribute_source: gix_worktree::stack::state::attributes::Source,
roots: gix_diff::blob::pipeline::WorktreeRoots,
) -> Result<gix_diff::blob::Platform, resource_cache::Error> {
let diff_algo = repo.config.diff_algorithm()?;
Expand All @@ -129,9 +132,7 @@ mod utils {
// TODO(perf): this could benefit from not having to build an intermediate index,
// and traverse a tree directly.
index,
// This is an optimization, as we avoid reading files from the working tree, which also
// might not match the index at all depending on what the user passed.
gix_worktree::stack::state::attributes::Source::IdMapping,
attribute_source,
)?
.inner,
);
Expand Down
274 changes: 182 additions & 92 deletions gix/src/object/blob.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,63 +3,132 @@ use crate::{Blob, ObjectDetached};
///
#[cfg(feature = "blob-diff")]
pub mod diff {
use gix_diff::blob::platform::prepare_diff::Operation;
use gix_diff::blob::ResourceKind;
use std::ops::Range;

use crate::{bstr::ByteSlice, object::blob::diff::line::Change};
use crate::object::tree::diff::change::Event;
use crate::{bstr::ByteSlice, object::blob::diff::lines::Change};

/// A platform to keep temporary information to perform line diffs on modified blobs.
///
pub struct Platform<'old, 'new> {
/// The previous version of the blob.
pub old: crate::Object<'old>,
/// The new version of the blob.
pub new: crate::Object<'new>,
/// The algorithm to use when calling [imara_diff::diff()][gix_diff::blob::diff()].
/// This value is determined by the `diff.algorithm` configuration.
pub algo: gix_diff::blob::Algorithm,
pub struct Platform<'a> {
/// The cache holding diffable data related to our blobs.
pub resource_cache: &'a mut gix_diff::blob::Platform,
}

///
pub mod init {
/// The error returned by [`Platform::from_ids()`][super::Platform::from_ids()].
#[derive(Debug, thiserror::Error)]
#[allow(missing_docs)]
pub enum Error {
#[error("Could not find the previous blob or the new blob to diff against")]
FindExisting(#[from] crate::object::find::existing::Error),
#[error("Could not obtain diff algorithm from configuration")]
DiffAlgorithm(#[from] crate::config::diff::algorithm::Error),
}
/// The error returned by [`Platform::from_tree_change()`][super::Platform::from_tree_change()].
pub type Error = gix_diff::blob::platform::set_resource::Error;
}

impl<'old, 'new> Platform<'old, 'new> {
/// Produce a platform for performing various diffs after obtaining the object data of `previous_id` and `new_id`.
///
/// Note that these objects are treated as raw data and are assumed to be blobs.
pub fn from_ids(
previous_id: &crate::Id<'old>,
new_id: &crate::Id<'new>,
) -> Result<Platform<'old, 'new>, init::Error> {
match previous_id
.object()
.and_then(|old| new_id.object().map(|new| (old, new)))
{
Ok((old, new)) => {
let algo = match new_id.repo.config.diff_algorithm() {
Ok(algo) => algo,
Err(err) => return Err(err.into()),
};
Ok(Platform { old, new, algo })
impl<'a, 'new> Platform<'a> {
/// Produce a platform for performing various diffs after obtaining the data from a single `tree_change`.
pub fn from_tree_change(
tree_change: &crate::object::tree::diff::Change<'_, '_, '_>,
resource_cache: &'a mut gix_diff::blob::Platform,
) -> Result<Platform<'a>, init::Error> {
match tree_change.event {
Event::Addition { entry_mode, id } => {
resource_cache.set_resource(
id.repo.object_hash().null(),
entry_mode.kind(),
tree_change.location,
ResourceKind::OldOrSource,
&id.repo.objects,
)?;
resource_cache.set_resource(
id.inner,
entry_mode.kind(),
tree_change.location,
ResourceKind::NewOrDestination,
&id.repo.objects,
)?;
}
Event::Deletion { entry_mode, id } => {
resource_cache.set_resource(
id.inner,
entry_mode.kind(),
tree_change.location,
ResourceKind::OldOrSource,
&id.repo.objects,
)?;
resource_cache.set_resource(
id.repo.object_hash().null(),
entry_mode.kind(),
tree_change.location,
ResourceKind::NewOrDestination,
&id.repo.objects,
)?;
}
Event::Modification {
previous_entry_mode,
previous_id,
entry_mode,
id,
} => {
resource_cache.set_resource(
previous_id.inner,
previous_entry_mode.kind(),
tree_change.location,
ResourceKind::OldOrSource,
&previous_id.repo.objects,
)?;
resource_cache.set_resource(
id.inner,
entry_mode.kind(),
tree_change.location,
ResourceKind::NewOrDestination,
&id.repo.objects,
)?;
}
Event::Rewrite {
source_location,
source_entry_mode,
source_id,
entry_mode,
id,
diff: _,
copy: _,
} => {
resource_cache.set_resource(
source_id.inner,
source_entry_mode.kind(),
source_location,
ResourceKind::OldOrSource,
&source_id.repo.objects,
)?;
resource_cache.set_resource(
id.inner,
entry_mode.kind(),
tree_change.location,
ResourceKind::NewOrDestination,
&id.repo.objects,
)?;
}
Err(err) => Err(err.into()),
}
Ok(Self { resource_cache })
}
}

///
pub mod line {
pub mod lines {
use crate::bstr::BStr;

/// The error returned by [Platform::lines()](super::Platform::lines()).
#[derive(Debug, thiserror::Error)]
#[allow(missing_docs)]
pub enum Error<E>
where
E: std::error::Error + Send + Sync + 'static,
{
/// The caller-provided `process_hunk` callback returned an error of type `E`, which is wrapped here.
#[error(transparent)]
ProcessHunk(E),
/// Preparing the diff failed, which happens before any hunk is passed to `process_hunk`.
#[error(transparent)]
PrepareDiff(#[from] gix_diff::blob::platform::prepare_diff::Error),
}

/// A change to a hunk of lines.
pub enum Change<'a, 'data> {
/// Lines were added.
Expand All @@ -82,70 +151,91 @@ pub mod diff {
}
}

impl<'old, 'new> Platform<'old, 'new> {
impl<'a> Platform<'a> {
/// Perform a diff on lines between the old and the new version of a blob, passing each hunk of lines to `process_hunk`.
/// The diffing algorithm is determined by the `diff.algorithm` configuration.
///
/// Note that you can invoke the diff more flexibly as well.
/// The diffing algorithm is determined by the `diff.algorithm` configuration, or individual diff drivers.
/// Note that `process_hunk` is not called if one of the involved resources is binary, but that can be determined
/// by introspecting the outcome.
// TODO: more tests (only tested insertion right now)
pub fn lines<FnH, E>(&self, mut process_hunk: FnH) -> Result<(), E>
pub fn lines<FnH, E>(
&mut self,
mut process_hunk: FnH,
) -> Result<gix_diff::blob::platform::prepare_diff::Outcome<'_>, lines::Error<E>>
where
FnH: FnMut(line::Change<'_, '_>) -> Result<(), E>,
E: std::error::Error,
FnH: FnMut(lines::Change<'_, '_>) -> Result<(), E>,
E: std::error::Error + Send + Sync + 'static,
{
let input = self.line_tokens();
let mut err = None;
let mut lines = Vec::new();
gix_diff::blob::diff(self.algo, &input, |before: Range<u32>, after: Range<u32>| {
if err.is_some() {
return;
self.resource_cache.options.skip_internal_diff_if_external_is_configured = false;

let prep = self.resource_cache.prepare_diff()?;
match prep.operation {
Operation::InternalDiff { algorithm } => {
let input = prep.interned_input();
let mut err = None;
let mut lines = Vec::new();

gix_diff::blob::diff(algorithm, &input, |before: Range<u32>, after: Range<u32>| {
if err.is_some() {
return;
}
lines.clear();
lines.extend(
input.before[before.start as usize..before.end as usize]
.iter()
.map(|&line| input.interner[line].as_bstr()),
);
let end_of_before = lines.len();
lines.extend(
input.after[after.start as usize..after.end as usize]
.iter()
.map(|&line| input.interner[line].as_bstr()),
);
let hunk_before = &lines[..end_of_before];
let hunk_after = &lines[end_of_before..];
if hunk_after.is_empty() {
err = process_hunk(Change::Deletion { lines: hunk_before }).err();
} else if hunk_before.is_empty() {
err = process_hunk(Change::Addition { lines: hunk_after }).err();
} else {
err = process_hunk(Change::Modification {
lines_before: hunk_before,
lines_after: hunk_after,
})
.err();
}
});

if let Some(err) = err {
return Err(lines::Error::ProcessHunk(err));
}
}
lines.clear();
lines.extend(
input.before[before.start as usize..before.end as usize]
.iter()
.map(|&line| input.interner[line].as_bstr()),
);
let end_of_before = lines.len();
lines.extend(
input.after[after.start as usize..after.end as usize]
.iter()
.map(|&line| input.interner[line].as_bstr()),
);
let hunk_before = &lines[..end_of_before];
let hunk_after = &lines[end_of_before..];
if hunk_after.is_empty() {
err = process_hunk(Change::Deletion { lines: hunk_before }).err();
} else if hunk_before.is_empty() {
err = process_hunk(Change::Addition { lines: hunk_after }).err();
} else {
err = process_hunk(Change::Modification {
lines_before: hunk_before,
lines_after: hunk_after,
})
.err();
Operation::ExternalCommand { .. } => {
unreachable!("we disabled that")
}
});

match err {
Some(err) => Err(err),
None => Ok(()),
}
Operation::SourceOrDestinationIsBinary => {}
};
Ok(prep)
}

/// Count the number of removed and inserted lines efficiently.
pub fn line_counts(&self) -> gix_diff::blob::sink::Counter<()> {
let tokens = self.line_tokens();
gix_diff::blob::diff(self.algo, &tokens, gix_diff::blob::sink::Counter::default())
}
/// Note that nothing will happen if one of the inputs is binary, and `None` will be returned.
pub fn line_counts(
&mut self,
) -> Result<Option<gix_diff::blob::sink::Counter<()>>, gix_diff::blob::platform::prepare_diff::Error> {
self.resource_cache.options.skip_internal_diff_if_external_is_configured = false;

/// Return a tokenizer which treats lines as smallest unit for use in a [diff operation][gix_diff::blob::diff()].
///
/// The line separator is determined according to normal git rules and filters.
pub fn line_tokens(&self) -> gix_diff::blob::intern::InternedInput<&[u8]> {
// TODO: make use of `core.eol` and/or filters to do line-counting correctly. It's probably
// OK to just know how these objects are saved to know what constitutes a line.
gix_diff::blob::intern::InternedInput::new(self.old.data.as_bytes(), self.new.data.as_bytes())
let prep = self.resource_cache.prepare_diff()?;
match prep.operation {
Operation::InternalDiff { algorithm } => {
let tokens = prep.interned_input();
let counter = gix_diff::blob::diff(algorithm, &tokens, gix_diff::blob::sink::Counter::default());
Ok(Some(counter))
}
Operation::ExternalCommand { .. } => {
unreachable!("we disabled that")
}
Operation::SourceOrDestinationIsBinary => Ok(None),
}
}
}
}
Expand Down
32 changes: 16 additions & 16 deletions gix/src/object/tree/diff/change.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,25 +68,25 @@ pub enum Event<'a, 'old, 'new> {
},
}

impl<'a, 'old, 'new> Event<'a, 'old, 'new> {
/// Produce a platform for performing a line-diff, or `None` if this is not a [`Modification`][Event::Modification]
/// or one of the entries to compare is not a blob.
pub fn diff(
impl<'a, 'old, 'new> super::Change<'a, 'old, 'new> {
/// Produce a platform for performing a line-diff no matter whether the underlying [Event] is an addition, modification,
/// deletion or rewrite.
/// Use `resource_cache` to store the diffable data and possibly reuse previously stored data.
/// Afterwards the platform, which holds on to `resource_cache`, can be used to perform ready-made operations on the
/// pre-set resources.
///
/// ### Warning about Memory Consumption
///
/// `resource_cache` only grows, so one should call [`gix_diff::blob::Platform::clear_resource_cache`] occasionally.
pub fn diff<'b>(
&self,
) -> Option<Result<crate::object::blob::diff::Platform<'old, 'new>, crate::object::blob::diff::init::Error>> {
match self {
Event::Modification {
previous_entry_mode,
previous_id,
entry_mode,
id,
} if entry_mode.is_blob() && previous_entry_mode.is_blob() => {
Some(crate::object::blob::diff::Platform::from_ids(previous_id, id))
}
_ => None,
}
resource_cache: &'b mut gix_diff::blob::Platform,
) -> Result<crate::object::blob::diff::Platform<'b>, crate::object::blob::diff::init::Error> {
crate::object::blob::diff::Platform::from_tree_change(self, resource_cache)
}
}

impl<'a, 'old, 'new> Event<'a, 'old, 'new> {
/// Return the current mode of this instance.
pub fn entry_mode(&self) -> gix_object::tree::EntryMode {
match self {
Expand Down
Loading

0 comments on commit 4743212

Please sign in to comment.