Skip to content

Commit

Permalink
[commitgraph] Implement basic commit-graph file verification.
Browse files Browse the repository at this point in the history
Missing features:
1. It operates on commit-graph files only, so it doesn't verify that
   commit-graph data matches `git-odb` data.
2. No progress reporting or parallelization. This shouldn't be needed
   until until we need to check against `git-odb` data.

Example output for Linux repo:
```
$ time ./target/release/gixp commit-graph-verify -s ~/src/linux/.git/objects/info
number of commits with the given number of parents
	 0: 4
	 1: 878988
	 2: 67800
	 3: 652
	 4: 408
	 5: 382
	 6: 454
	 7: 95
	 8: 65
	 9: 47
	10: 25
	11: 26
	12: 14
	13: 4
	14: 3
	18: 1
	19: 1
	20: 1
	21: 1
	24: 1
	27: 1
	30: 1
	32: 1
	66: 1
	->: 948976

longest path length between two commits: 160521

real	0m0.196s
user	0m0.180s
sys	0m0.016s
```
  • Loading branch information
avoidscorn committed Oct 11, 2020
1 parent 701f33c commit 2571113
Show file tree
Hide file tree
Showing 19 changed files with 459 additions and 21 deletions.
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion git-commitgraph/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,17 @@ include = ["src/**/*"]
[lib]
doctest = false

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[features]
serde1 = ["serde", "git-object/serde1"]

[dependencies]
git-features = { version = "^0.6.0", path = "../git-features" }
git-object = { version = "^0.4.0", path = "../git-object" }

bstr = { version = "0.2.13", default-features = false, features = ["std"] }
byteorder = "1.2.3"
filebuffer = "0.4.0"
serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] }
thiserror = "1.0.20"

[dev-dependencies]
Expand Down
4 changes: 4 additions & 0 deletions git-commitgraph/src/file/access.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ use std::{

/// Access
impl File {
pub fn base_graph_count(&self) -> u8 {
self.base_graph_count
}

/// Returns the commit data for the commit located at the given lexigraphical position.
///
/// `pos` must range from 0 to self.num_commits().
Expand Down
4 changes: 4 additions & 0 deletions git-commitgraph/src/file/commit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ impl<'a> Commit<'a> {
self.iter_parents().next().transpose()
}

pub fn position(&self) -> file::Position {
self.pos
}

// Allow the return value to outlive this Commit object, as it only needs to be bound by the
// lifetime of the parent file.
pub fn root_tree_id<'b>(&'b self) -> borrowed::Id<'a>
Expand Down
2 changes: 2 additions & 0 deletions git-commitgraph/src/file/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
mod access;
pub mod commit;
mod init;
pub mod verify;

pub use init::Error;

pub use commit::Commit;
Expand Down
167 changes: 167 additions & 0 deletions git-commitgraph/src/file/verify.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
use crate::{
file::{self, File},
GENERATION_NUMBER_INFINITY, GENERATION_NUMBER_MAX,
};
use bstr::ByteSlice;
use git_object::{borrowed, owned, SHA1_SIZE};
use std::cmp::{max, min};
use std::collections::HashMap;
use std::convert::TryFrom;
use std::path::Path;

#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error(transparent)]
Commit(#[from] file::commit::Error),
#[error("commit at file position {pos} has invalid ID {id}")]
CommitId { id: owned::Id, pos: file::Position },
#[error("commit at file position {pos} with ID {id} is out of order relative to its predecessor with ID {predecessor_id}")]
CommitsOutOfOrder {
id: owned::Id,
pos: file::Position,
predecessor_id: owned::Id,
},
#[error("commit-graph filename should be {0}")]
Filename(String),
#[error("commit {id} has invalid generation {generation}")]
Generation { generation: u32, id: owned::Id },
#[error("checksum mismatch: expected {expected}, got {actual}")]
Mismatch { expected: owned::Id, actual: owned::Id },
#[error("commit {id} has invalid root tree ID {root_tree_id}")]
RootTreeId { id: owned::Id, root_tree_id: owned::Id },
}

// This is a separate type to let `traverse`'s caller use the same error type for its result and its
// processor error type while also letting that error type contain file::verify::Error values.
// Is there a better way? Should the caller's error type just use boxes to avoid recursive type
// errors?
#[derive(thiserror::Error, Debug)]
pub enum EitherError<E1: std::error::Error + 'static, E2: std::error::Error + 'static> {
#[error(transparent)]
Internal(#[from] E1),
// Why can't I use #[from] here? Boo!
#[error("{0}")]
Processor(#[source] E2),
}

#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde1", derive(serde::Deserialize, serde::Serialize))]
pub struct Outcome {
pub max_generation: u32,
pub max_parents: u32,
pub min_generation: u32,
pub num_commits: u32,
pub parent_counts: HashMap<u32, u32>,
}

impl File {
pub fn checksum(&self) -> borrowed::Id<'_> {
borrowed::Id::try_from(&self.data[self.data.len() - SHA1_SIZE..]).expect("file to be large enough for a hash")
}

pub fn traverse<'a, E, Processor>(&'a self, mut processor: Processor) -> Result<Outcome, EitherError<Error, E>>
where
E: std::error::Error + 'static,
Processor: FnMut(&file::Commit<'a>) -> Result<(), E>,
{
self.verify_checksum()?;
verify_split_chain_filename_hash(&self.path, self.checksum())?;

// This probably belongs in borrowed::Id itself?
let null_id = borrowed::Id::from(&[0u8; SHA1_SIZE]);

let mut stats = Outcome {
max_generation: 0,
max_parents: 0,
min_generation: GENERATION_NUMBER_INFINITY,
num_commits: self.num_commits(),
parent_counts: HashMap::new(),
};

// TODO: Verify self.fan values as we go.
let mut prev_id: borrowed::Id<'a> = null_id;
for commit in self.iter_commits() {
if commit.id() <= prev_id {
if commit.id() == null_id {
return Err(Error::CommitId {
pos: commit.position(),
id: commit.id().into(),
}
.into());
}
return Err(Error::CommitsOutOfOrder {
pos: commit.position(),
id: commit.id().into(),
predecessor_id: prev_id.into(),
}
.into());
}
if commit.root_tree_id() == null_id {
return Err(Error::RootTreeId {
id: commit.id().into(),
root_tree_id: commit.root_tree_id().into(),
}
.into());
}
if commit.generation() > GENERATION_NUMBER_MAX {
return Err(Error::Generation {
generation: commit.generation(),
id: commit.id().into(),
}
.into());
}

processor(&commit).map_err(EitherError::Processor)?;

stats.max_generation = max(stats.max_generation, commit.generation());
stats.min_generation = min(stats.min_generation, commit.generation());
let parent_count = commit
.iter_parents()
.try_fold(0u32, |acc, pos| pos.map(|_| acc + 1))
.map_err(Error::Commit)?;
*stats.parent_counts.entry(parent_count).or_insert(0) += 1;
prev_id = commit.id();
}

if stats.min_generation == GENERATION_NUMBER_INFINITY {
stats.min_generation = 0;
}

Ok(stats)
}

pub fn verify_checksum(&self) -> Result<owned::Id, Error> {
// TODO: Use/copy git_odb::hash::bytes_of_file.
let data_len_without_trailer = self.data.len() - SHA1_SIZE;
let mut hasher = git_features::hash::Sha1::default();
hasher.update(&self.data[..data_len_without_trailer]);
let actual = owned::Id::new_sha1(hasher.digest());

let expected = self.checksum();
if actual.to_borrowed() == expected {
Ok(actual)
} else {
Err(Error::Mismatch {
actual,
expected: expected.into(),
})
}
}
}

/// If the given path's filename matches "graph-{hash}.graph", check that `hash` matches the
/// expected hash.
fn verify_split_chain_filename_hash(path: impl AsRef<Path>, expected: borrowed::Id<'_>) -> Result<(), Error> {
let path = path.as_ref();
path.file_name()
.and_then(|filename| filename.to_str())
.and_then(|filename| filename.strip_suffix(".graph"))
.and_then(|stem| stem.strip_prefix("graph-"))
.map_or(Ok(()), |hex| match owned::Id::from_40_bytes_in_hex(hex.as_bytes()) {
Ok(actual) if actual.to_borrowed() == expected => Ok(()),
_ => Err(Error::Filename(format!(
"graph-{}.graph",
expected.to_sha1_hex().as_bstr()
))),
})
}
12 changes: 7 additions & 5 deletions git-commitgraph/src/graph/access.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ impl Graph {

/// Access fundamentals
impl Graph {
fn lookup_by_id(&self, id: borrowed::Id<'_>) -> Option<LookupByIdResult<'_>> {
pub(crate) fn lookup_by_id(&self, id: borrowed::Id<'_>) -> Option<LookupByIdResult<'_>> {
let mut current_file_start = 0;
for file in &self.files {
if let Some(lex_pos) = file.lookup(id) {
Expand All @@ -57,14 +57,15 @@ impl Graph {
None
}

fn lookup_by_pos(&self, pos: graph::Position) -> LookupByPositionResult<'_> {
pub(crate) fn lookup_by_pos(&self, pos: graph::Position) -> LookupByPositionResult<'_> {
let mut remaining = pos.0;
for file in &self.files {
for (file_index, file) in self.files.iter().enumerate() {
match remaining.checked_sub(file.num_commits()) {
Some(v) => remaining = v,
None => {
return LookupByPositionResult {
file,
file_index,
pos: file::Position(remaining),
}
}
Expand All @@ -75,14 +76,15 @@ impl Graph {
}

#[derive(Clone)]
struct LookupByIdResult<'a> {
pub(crate) struct LookupByIdResult<'a> {
pub file: &'a File,
pub graph_pos: graph::Position,
pub file_pos: file::Position,
}

#[derive(Clone)]
struct LookupByPositionResult<'a> {
pub(crate) struct LookupByPositionResult<'a> {
pub file: &'a File,
pub file_index: usize,
pub pos: file::Position,
}
1 change: 1 addition & 0 deletions git-commitgraph/src/graph/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
//! Operations on a complete commit graph.
mod access;
mod init;
pub mod verify;

use crate::file::File;
use std::fmt;
Expand Down
Loading

0 comments on commit 2571113

Please sign in to comment.