Skip to content

Commit

Permalink
feat: add basic connectivity check
Browse files Browse the repository at this point in the history
Implement a simple connectivity check in a new `gix-fsck` crate, and add
this to `gix` via a new `fsck` subcommand. Currently this is
functionally equivalent to:
`git rev-list --objects --quiet --missing=print`

feat: add basic connectivity check - address review feedback
  • Loading branch information
cesfahani authored and Byron committed Nov 10, 2023
1 parent e47c46d commit 8f795e8
Show file tree
Hide file tree
Showing 18 changed files with 378 additions and 2 deletions.
12 changes: 12 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,7 @@ members = [
"gix-archive",
"gix-worktree-stream",
"gix-revwalk",
"gix-fsck",

"tests/tools",

Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ is usable to some extent.
* [gix-tui](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-tui)
* [gix-tix](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-tix)
* [gix-bundle](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-bundle)
* [gix-fsck](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-fsck)

### Stress Testing
* [x] Verify huge packs
Expand Down
13 changes: 13 additions & 0 deletions crate-status.md
Original file line number Diff line number Diff line change
Expand Up @@ -775,6 +775,19 @@ See its [README.md](https://github.com/Byron/gitoxide/blob/main/gix-lock/README.
* [x] validate submodule names
* [x] [validate][tagname-validation] tag names

### gix-fsck
* [x] validate connectivity when provided a specific commit as a starting point
* [ ] validate connectivity of all `refs` in the index
* [ ] progress reporting and interruptability
* [ ] identify objects that exist but are not reference by any reference nodes (e.g. `refs` or a provided commit)
* [ ] add support for various options
* [ ] write dangling objects to the `.git/log-found` directory structure
* [ ] `strict` mode, to check for tree objects with `g+w` permissions
* [ ] consider reflog entries as reference nodes/heads
* [ ] when reporting reachable objects, include _how_ they are reachable
* [ ] which reference node(s) made them reachable
* [ ] parent commit(s)

### gix-ref
* [ ] Prepare code for arrival of longer hashes like Sha256. It's part of the [V2 proposal][reftable-v2] but should work for loose refs as well.
* **Stores**
Expand Down
1 change: 1 addition & 0 deletions gitoxide-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ gix-pack-for-configuration-only = { package = "gix-pack", version = "^0.44.0", p
gix-transport-configuration-only = { package = "gix-transport", version = "^0.38.0", path = "../gix-transport", default-features = false }
gix-archive-for-configuration-only = { package = "gix-archive", version = "^0.6.0", path = "../gix-archive", optional = true, features = ["tar", "tar_gz"] }
gix-status = { version = "^0.2.0", path = "../gix-status" }
gix-fsck = { version = "^0.0.0", path = "../gix-fsck" }
serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] }
anyhow = "1.0.42"
thiserror = "1.0.34"
Expand Down
40 changes: 40 additions & 0 deletions gitoxide-core/src/repository/fsck.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
use std::io::{BufWriter, Write};

use anyhow::Context;
use gix::{objs::Kind, ObjectId};

pub fn connectivity(mut repo: gix::Repository, spec: Option<String>, out: impl std::io::Write) -> anyhow::Result<()> {
let mut out = BufWriter::with_capacity(64 * 1024, out);
let spec = spec.unwrap_or("HEAD".into());

repo.object_cache_size_if_unset(4 * 1024 * 1024);
// We expect to be finding a bunch of non-existent objects here - never refresh the ODB
repo.objects.refresh_never();

let id = repo
.rev_parse_single(spec.as_str())
.context("Only single revisions are supported")?;
let commits: gix::revision::Walk<'_> = id
.object()?
.peel_to_kind(gix::object::Kind::Commit)
.context("Need commitish as starting point")?
.id()
.ancestors()
.all()?;

let missing_cb = |oid: &ObjectId, kind: Kind| {
writeln!(out, "{oid}: {kind}").expect("failed to write output");
};
let mut conn = gix_fsck::ConnectivityCheck::new(&repo.objects, missing_cb);

// Walk all commits, checking each one for connectivity
for commit in commits {
let commit = commit?;
conn.check_commit(&commit.id);
for parent in commit.parent_ids {
conn.check_commit(&parent);
}
}

Ok(())
}
1 change: 1 addition & 0 deletions gitoxide-core/src/repository/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ pub use clone::function::clone;
pub use fetch::function::fetch;

pub mod commitgraph;
pub mod fsck;
pub mod index;
pub mod mailmap;
pub mod odb;
Expand Down
22 changes: 22 additions & 0 deletions gix-fsck/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
[package]
name = "gix-fsck"
version = "0.0.0"
repository = "https://github.com/Byron/gitoxide"
authors = ["Cameron Esfahani <[email protected]>", "Sebastian Thiel <[email protected]>"]
license = "MIT OR Apache-2.0"
description = "Verifies the connectivity and validity of objects in the database"
edition = "2021"
include = ["src/**/*", "LICENSE-*", "CHANGELOG.md"]
rust-version = "1.65"

[lib]
doctest = false

[dependencies]
gix-hash = { version = "^0.13.1", path = "../gix-hash" }
gix-hashtable = { version = "^0.4.0", path = "../gix-hashtable" }
gix-object = { version = "^0.38.0", path = "../gix-object" }

[dev-dependencies]
gix-odb = { path = "../gix-odb" }
gix-testtools = { path = "../tests/tools"}
6 changes: 6 additions & 0 deletions gix-fsck/Changelog.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Changelog

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
1 change: 1 addition & 0 deletions gix-fsck/LICENSE-APACHE
1 change: 1 addition & 0 deletions gix-fsck/LICENSE-MIT
127 changes: 127 additions & 0 deletions gix-fsck/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
//! A library for performing object database integrity and connectivity checks
#![deny(rust_2018_idioms)]

use gix_hash::ObjectId;
use gix_hashtable::HashSet;
use gix_object::{tree::EntryMode, Exists, FindExt, Kind};

pub struct ConnectivityCheck<'a, T, F>
where
T: FindExt + Exists,
F: FnMut(&ObjectId, Kind),
{
/// ODB handle to use for the check
db: &'a T,
/// Closure to invoke when a missing object is encountered
missing_cb: F,
/// Set of Object IDs already (or about to be) scanned during the check
oid_set: HashSet,
/// Single buffer for decoding objects from the ODB
/// This is slightly faster than allocating throughout the connectivity check (and reduces the memory requirements)
buf: Vec<u8>,
}

impl<'a, T, F> ConnectivityCheck<'a, T, F>
where
T: FindExt + Exists,
F: FnMut(&ObjectId, Kind),
{
/// Instantiate a connectivity check
pub fn new(db: &'a T, missing_cb: F) -> ConnectivityCheck<'a, T, F> {
ConnectivityCheck {
db,
missing_cb,
oid_set: HashSet::default(),
buf: Vec::new(),
}
}

/// Run the connectivity check on the provided commit object ID
/// - This will walk the trees and blobs referenced by the commit and verify they exist in the ODB
/// - Any objects previously encountered by this [`ConnectivityCheck`] instance will be skipped
/// - Any referenced blobs that are not present in the ODB will result in a call to the `missing_cb`
/// - Missing commits or trees will currently result in panic
/// - TODO: consider how to handle a missing commit (invoke `missing_cb`, or possibly return a Result?)
pub fn check_commit(&mut self, oid: &ObjectId) {
// Attempt to insert the commit ID in the set, and if already present, return immediately
if !self.oid_set.insert(*oid) {
return;
}
// Obtain the commit's tree ID
let tree_id = {
let commit = self.db.find_commit(oid, &mut self.buf).expect("failed to find commit");
commit.tree()
};

// Attempt to insert the tree ID in the set, and if already present, return immediately
if self.oid_set.insert(tree_id) {
self.check_tree(&tree_id);
}
}

fn check_tree(&mut self, oid: &ObjectId) {
let tree = match self.db.find_tree(oid, &mut self.buf) {
Ok(tree) => tree,
Err(_) => {
// Tree is missing, so invoke `missing_cb`
(self.missing_cb)(oid, Kind::Tree);
return;
}
};

// Keeping separate sets for trees and blobs for now...
// This is about a wash when compared to using a HashMap<ObjectID, Kind>
struct TreeEntries {
trees: HashSet<ObjectId>,
blobs: HashSet<ObjectId>,
}

// Build up a set of trees and a set of blobs
let entries: TreeEntries = {
let mut entries = TreeEntries {
trees: HashSet::default(),
blobs: HashSet::default(),
};

// For each entry in the tree
for entry_ref in tree.entries.iter() {
match entry_ref.mode {
EntryMode::Tree => {
let tree_id = entry_ref.oid.to_owned();
// Check if the tree has already been encountered
if self.oid_set.insert(tree_id) {
entries.trees.insert(tree_id);
}
}
EntryMode::Blob | EntryMode::BlobExecutable | EntryMode::Link => {
let blob_id = entry_ref.oid.to_owned();
// Check if the blob has already been encountered
if self.oid_set.insert(blob_id) {
entries.blobs.insert(blob_id);
}
}
EntryMode::Commit => {
// This implies a submodule (OID is the commit hash of the submodule)
// Skip it as it's not in this repository!
}
}
}
entries
};

for tree_id in entries.trees.iter() {
self.check_tree(tree_id);
}
for blob_id in entries.blobs.iter() {
self.check_blob(blob_id);
}
}

fn check_blob(&mut self, oid: &ObjectId) {
// Check if the blob is missing from the ODB
if !self.db.exists(oid) {
// Blob is missing, so invoke `missing_cb`
(self.missing_cb)(oid, Kind::Blob);
}
}
}
84 changes: 84 additions & 0 deletions gix-fsck/tests/connectivity/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
use std::ops::Deref;

use gix_fsck::ConnectivityCheck;
use gix_hash::ObjectId;
use gix_hashtable::HashMap;
use gix_object::Kind;
use gix_testtools::once_cell::sync::Lazy;

use crate::hex_to_id;

fn check_missing<'a>(repo_name: &str, commits: impl IntoIterator<Item = &'a ObjectId>) -> HashMap<ObjectId, Kind> {
let db = {
let fixture_path = gix_testtools::scripted_fixture_read_only("make_test_repos.sh")
.expect("fixture path")
.join(repo_name)
.join(".git")
.join("objects");
let mut db = gix_odb::at(fixture_path).expect("odb handle");
db.refresh_never();
db
};

let mut missing: HashMap<ObjectId, Kind> = HashMap::default();
let callback = |oid: &ObjectId, kind: Kind| {
missing.try_insert(*oid, kind).expect("no duplicate oid");
};

let mut check = ConnectivityCheck::new(&db, callback);
for commit in commits.into_iter() {
check.check_commit(commit);
}

missing
}

fn hex_to_ids<'a>(hex_ids: impl IntoIterator<Item = &'a str>) -> Vec<ObjectId> {
hex_ids.into_iter().map(hex_to_id).collect()
}

fn hex_to_objects<'a>(hex_ids: impl IntoIterator<Item = &'a str>, kind: Kind) -> HashMap<ObjectId, Kind> {
hex_to_ids(hex_ids)
.into_iter()
.map(|id| (id, kind))
.collect::<HashMap<_, _>>()
}

// Get a `&Vec<ObjectID` for each commit in the test fixture repository
fn all_commits<'a>() -> &'a Vec<ObjectId> {
static ALL_COMMITS: Lazy<Vec<ObjectId>> = Lazy::new(|| {
hex_to_ids(vec![
"5d18db2e2aabadf7b914435ef34f2faf8b4546dd",
"3a3dfaa55a515f3fb3a25751107bbb523af6a1b0",
"734c926856a328d1168ffd7088532e0d1ad19bbe",
])
});
ALL_COMMITS.deref()
}

#[test]
fn no_missing() {
// The "base" repo is the original, and has every object present
assert_eq!(check_missing("base", all_commits()), HashMap::default());
}

#[test]
fn missing_blobs() {
// The "blobless" repo is cloned with `--filter=blob:none`, and is missing one blob
let expected = hex_to_objects(vec!["c18147dc648481eeb65dc5e66628429a64843327"], Kind::Blob);
assert_eq!(check_missing("blobless", all_commits()), expected);
}

#[test]
fn missing_trees() {
// The "treeless" repo is cloned with `--filter=tree:0`, and is missing two trees
// NOTE: This repo is also missing a blob, but we have no way of knowing that, as the tree referencing it is missing
let expected = hex_to_objects(
vec![
"9561cfbae43c5e2accdfcd423378588dd10d827f",
"fc264b3b6875a46e9031483aeb9994a1b897ffd3",
],
Kind::Tree,
);
assert_eq!(check_missing("treeless", all_commits()), expected);
}
1 change: 1 addition & 0 deletions gix-fsck/tests/fixtures/generated-archives/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
make_test_repos.tar.xz
Loading

0 comments on commit 8f795e8

Please sign in to comment.