Skip to content

Commit

Permalink
refactor
Browse files Browse the repository at this point in the history
- update `gix-fsck` crate status with all obvious features of `git-fsck`
- clarify that fsck is only partial just to be sure people won't use it in place of `git fsck`
- if BufWriter's are truly needed, let's expect it as type to avoid double-buffering unnecessarily.
- consistent naming of the changelog markdown fail (all caps)
- set version to 0.1.0 for fsck crate as 0.0 is used only for name-keeping when there is no functionality.
- general cleanup
- avoid allocating for each tree (but it's still recursive)
  • Loading branch information
Byron committed Nov 10, 2023
1 parent 8f795e8 commit 7a88b42
Show file tree
Hide file tree
Showing 12 changed files with 156 additions and 124 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 14 additions & 10 deletions crate-status.md
Original file line number Diff line number Diff line change
Expand Up @@ -776,17 +776,21 @@ See its [README.md](https://github.com/Byron/gitoxide/blob/main/gix-lock/README.
* [x] [validate][tagname-validation] tag names

### gix-fsck
* [x] validate connectivity when provided a specific commit as a starting point
* [ ] validate connectivity of all `refs` in the index
* [x] validate connectivity and find missing objects starting from…
- [x] commits
- [ ] tags
- [ ] tree-cache in the `index` or any entry within
* [ ] validate object hashes during connectivity traversal
* [ ] progress reporting and interruptability
* [ ] identify objects that exist but are not reference by any reference nodes (e.g. `refs` or a provided commit)
* [ ] add support for various options
* [ ] write dangling objects to the `.git/log-found` directory structure
* [ ] `strict` mode, to check for tree objects with `g+w` permissions
* [ ] consider reflog entries as reference nodes/heads
* [ ] when reporting reachable objects, include _how_ they are reachable
* [ ] which reference node(s) made them reachable
* [ ] parent commit(s)
* [ ] skipList to exclude objects which are known to be broken
* [ ] validate blob hashes (connectivity check
* [ ] identify objects that exist but are not reachable (i.e. what remains after a full graph traversal from all valid starting points)
* [ ] write dangling objects to the `.git/log-found` directory structure
* [ ] `strict` mode, to check for tree objects with `g+w` permissions
* [ ] consider reflog entries from `ref` starting points
* [ ] when reporting reachable objects, provide the path through which they are reachable, i.e. ref-log@{3} -> commit -> tree -> path-in-tree
* [ ] limit search to ODB without alternates (default is equivalent to `git fsck --full` due to ODB implementation)
* [ ] all individual [checks available in `git fsck`](https://git-scm.com/docs/git-fsck#_fsck_messages) (*too many to print here*)

### gix-ref
* [ ] Prepare code for arrival of longer hashes like Sha256. It's part of the [V2 proposal][reftable-v2] but should work for loose refs as well.
Expand Down
1 change: 1 addition & 0 deletions etc/check-package-size.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ function indent () {
}

echo "in root: gitoxide CLI"
(enter gix-fsck && indent cargo diet -n --package-size-limit 10KB)
(enter gix-actor && indent cargo diet -n --package-size-limit 10KB)
(enter gix-archive && indent cargo diet -n --package-size-limit 10KB)
(enter gix-worktree-stream && indent cargo diet -n --package-size-limit 40KB)
Expand Down
2 changes: 1 addition & 1 deletion gitoxide-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ gix-pack-for-configuration-only = { package = "gix-pack", version = "^0.44.0", p
gix-transport-configuration-only = { package = "gix-transport", version = "^0.38.0", path = "../gix-transport", default-features = false }
gix-archive-for-configuration-only = { package = "gix-archive", version = "^0.6.0", path = "../gix-archive", optional = true, features = ["tar", "tar_gz"] }
gix-status = { version = "^0.2.0", path = "../gix-status" }
gix-fsck = { version = "^0.0.0", path = "../gix-fsck" }
gix-fsck = { version = "^0.1.0", path = "../gix-fsck" }
serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] }
anyhow = "1.0.42"
thiserror = "1.0.34"
Expand Down
22 changes: 11 additions & 11 deletions gitoxide-core/src/repository/fsck.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
use std::io::{BufWriter, Write};

use anyhow::Context;
use gix::{objs::Kind, ObjectId};

pub fn connectivity(mut repo: gix::Repository, spec: Option<String>, out: impl std::io::Write) -> anyhow::Result<()> {
let mut out = BufWriter::with_capacity(64 * 1024, out);
pub fn connectivity(
mut repo: gix::Repository,
spec: Option<String>,
mut out: impl std::io::Write,
) -> anyhow::Result<()> {
let spec = spec.unwrap_or("HEAD".into());

repo.object_cache_size_if_unset(4 * 1024 * 1024);
Expand All @@ -22,19 +23,18 @@ pub fn connectivity(mut repo: gix::Repository, spec: Option<String>, out: impl s
.ancestors()
.all()?;

let missing_cb = |oid: &ObjectId, kind: Kind| {
let on_missing = |oid: &ObjectId, kind: Kind| {
writeln!(out, "{oid}: {kind}").expect("failed to write output");
};
let mut conn = gix_fsck::ConnectivityCheck::new(&repo.objects, missing_cb);

let mut check = gix_fsck::Connectivity::new(&repo.objects, on_missing);
// Walk all commits, checking each one for connectivity
for commit in commits {
let commit = commit?;
conn.check_commit(&commit.id);
for parent in commit.parent_ids {
conn.check_commit(&parent);
}
check.check_commit(&commit.id);
// Note that we leave parent-iteration to the commits iterator, as it will
// correctly handle shallow repositories which are expected to have the commits
// along the shallow boundary missing.
}

Ok(())
}
File renamed without changes.
4 changes: 2 additions & 2 deletions gix-fsck/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
[package]
name = "gix-fsck"
version = "0.0.0"
version = "0.1.0"
repository = "https://github.com/Byron/gitoxide"
authors = ["Cameron Esfahani <[email protected]>", "Sebastian Thiel <[email protected]>"]
license = "MIT OR Apache-2.0"
description = "Verifies the connectivity and validity of objects in the database"
edition = "2021"
include = ["src/**/*", "LICENSE-*", "CHANGELOG.md"]
include = ["src/**/*", "LICENSE-*"]
rust-version = "1.65"

[lib]
Expand Down
190 changes: 112 additions & 78 deletions gix-fsck/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,127 +1,161 @@
//! A library for performing object database integrity and connectivity checks
#![deny(rust_2018_idioms)]
#![deny(rust_2018_idioms, unsafe_code, missing_docs)]

use gix_hash::ObjectId;
use gix_hashtable::HashSet;
use gix_object::{tree::EntryMode, Exists, FindExt, Kind};
use std::cell::RefCell;
use std::ops::{Deref, DerefMut};

pub struct ConnectivityCheck<'a, T, F>
/// Perform a connectivity check.
pub struct Connectivity<T, F>
where
T: FindExt + Exists,
F: FnMut(&ObjectId, Kind),
{
/// ODB handle to use for the check
db: &'a T,
db: T,
/// Closure to invoke when a missing object is encountered
missing_cb: F,
/// Set of Object IDs already (or about to be) scanned during the check
oid_set: HashSet,
/// Single buffer for decoding objects from the ODB
/// This is slightly faster than allocating throughout the connectivity check (and reduces the memory requirements)
buf: Vec<u8>,
/// A free-list of buffers for recursive tree decoding.
free_list: FreeList,
}

impl<'a, T, F> ConnectivityCheck<'a, T, F>
impl<T, F> Connectivity<T, F>
where
T: FindExt + Exists,
F: FnMut(&ObjectId, Kind),
{
/// Instantiate a connectivity check
pub fn new(db: &'a T, missing_cb: F) -> ConnectivityCheck<'a, T, F> {
ConnectivityCheck {
/// Instantiate a connectivity check.
pub fn new(db: T, missing_cb: F) -> Connectivity<T, F> {
Connectivity {
db,
missing_cb,
oid_set: HashSet::default(),
buf: Vec::new(),
free_list: Default::default(),
}
}

/// Run the connectivity check on the provided commit object ID
/// - This will walk the trees and blobs referenced by the commit and verify they exist in the ODB
/// - Any objects previously encountered by this [`ConnectivityCheck`] instance will be skipped
/// - Any referenced blobs that are not present in the ODB will result in a call to the `missing_cb`
/// - Missing commits or trees will currently result in panic
/// Run the connectivity check on the provided commit `oid`.
///
/// ### Algorithm
///
/// Walk the trees and blobs referenced by the commit and verify they exist in the ODB.
/// Any objects previously encountered by this instance will be skipped silently.
/// Any referenced blobs that are not present in the ODB will result in a call to the `missing_cb`.
/// Missing commits or trees will cause an error to be returned.
/// - TODO: consider how to handle a missing commit (invoke `missing_cb`, or possibly return a Result?)
pub fn check_commit(&mut self, oid: &ObjectId) {
pub fn check_commit(&mut self, oid: &ObjectId) -> Result<(), gix_object::find::existing_object::Error> {
// Attempt to insert the commit ID in the set, and if already present, return immediately
if !self.oid_set.insert(*oid) {
return;
return Ok(());
}
// Obtain the commit's tree ID
let tree_id = {
let commit = self.db.find_commit(oid, &mut self.buf).expect("failed to find commit");
let mut buf = self.free_list.buf();
let commit = self.db.find_commit(oid, &mut buf)?;
commit.tree()
};

// Attempt to insert the tree ID in the set, and if already present, return immediately
if self.oid_set.insert(tree_id) {
self.check_tree(&tree_id);
check_tree(
&tree_id,
&self.db,
&mut self.free_list,
&mut self.missing_cb,
&mut self.oid_set,
);
}

Ok(())
}
}

fn check_tree(&mut self, oid: &ObjectId) {
let tree = match self.db.find_tree(oid, &mut self.buf) {
Ok(tree) => tree,
Err(_) => {
// Tree is missing, so invoke `missing_cb`
(self.missing_cb)(oid, Kind::Tree);
return;
}
};
#[derive(Default)]
struct FreeList(RefCell<Vec<Vec<u8>>>);

// Keeping separate sets for trees and blobs for now...
// This is about a wash when compared to using a HashMap<ObjectID, Kind>
struct TreeEntries {
trees: HashSet<ObjectId>,
blobs: HashSet<ObjectId>,
}
impl FreeList {
fn buf(&self) -> ReturnToFreeListOnDrop<'_> {
let buf = self.0.borrow_mut().pop().unwrap_or_default();
ReturnToFreeListOnDrop { buf, list: &self.0 }
}
}

// Build up a set of trees and a set of blobs
let entries: TreeEntries = {
let mut entries = TreeEntries {
trees: HashSet::default(),
blobs: HashSet::default(),
};

// For each entry in the tree
for entry_ref in tree.entries.iter() {
match entry_ref.mode {
EntryMode::Tree => {
let tree_id = entry_ref.oid.to_owned();
// Check if the tree has already been encountered
if self.oid_set.insert(tree_id) {
entries.trees.insert(tree_id);
}
}
EntryMode::Blob | EntryMode::BlobExecutable | EntryMode::Link => {
let blob_id = entry_ref.oid.to_owned();
// Check if the blob has already been encountered
if self.oid_set.insert(blob_id) {
entries.blobs.insert(blob_id);
}
}
EntryMode::Commit => {
// This implies a submodule (OID is the commit hash of the submodule)
// Skip it as it's not in this repository!
}
}
}
entries
};
struct ReturnToFreeListOnDrop<'a> {
list: &'a RefCell<Vec<Vec<u8>>>,
buf: Vec<u8>,
}

for tree_id in entries.trees.iter() {
self.check_tree(tree_id);
}
for blob_id in entries.blobs.iter() {
self.check_blob(blob_id);
impl Drop for ReturnToFreeListOnDrop<'_> {
fn drop(&mut self) {
if !self.buf.is_empty() {
self.list.borrow_mut().push(std::mem::take(&mut self.buf));
}
}
}

impl Deref for ReturnToFreeListOnDrop<'_> {
type Target = Vec<u8>;

fn deref(&self) -> &Self::Target {
&self.buf
}
}

impl DerefMut for ReturnToFreeListOnDrop<'_> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.buf
}
}

fn check_blob<F>(db: impl Exists, oid: &ObjectId, mut missing_cb: F)
where
F: FnMut(&ObjectId, Kind),
{
// Check if the blob is missing from the ODB
if !db.exists(oid) {
// Blob is missing, so invoke `missing_cb`
missing_cb(oid, Kind::Blob);
}
}

fn check_blob(&mut self, oid: &ObjectId) {
// Check if the blob is missing from the ODB
if !self.db.exists(oid) {
// Blob is missing, so invoke `missing_cb`
(self.missing_cb)(oid, Kind::Blob);
fn check_tree<F>(
oid: &ObjectId,
db: &(impl FindExt + Exists),
list: &FreeList,
missing_cb: &mut F,
oid_set: &mut HashSet,
) where
F: FnMut(&ObjectId, Kind),
{
let mut buf = list.buf();
let Ok(tree) = db.find_tree(oid, &mut buf) else {
missing_cb(oid, Kind::Tree);
return;
};

// Build up a set of trees and a set of blobs
// For each entry in the tree
for entry_ref in tree.entries.iter() {
match entry_ref.mode {
EntryMode::Tree => {
let tree_id = entry_ref.oid.to_owned();
if oid_set.insert(tree_id) {
check_tree(&tree_id, &*db, list, &mut *missing_cb, oid_set);
}
}
EntryMode::Blob | EntryMode::BlobExecutable | EntryMode::Link => {
let blob_id = entry_ref.oid.to_owned();
if oid_set.insert(blob_id) {
check_blob(&*db, &blob_id, &mut *missing_cb);
}
}
EntryMode::Commit => {
// This implies a submodule (OID is the commit hash of the submodule)
// Skip it as it's not in this repository!
}
}
}
}
Loading

0 comments on commit 7a88b42

Please sign in to comment.