Skip to content

Commit

Permalink
Load chunk index of midx file (#279)
Browse files Browse the repository at this point in the history
  • Loading branch information
Byron committed Dec 20, 2021
1 parent b2d2ae2 commit fac8efa
Show file tree
Hide file tree
Showing 4 changed files with 168 additions and 5 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

152 changes: 152 additions & 0 deletions git-chunk/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,155 @@
//! See the [git documentation](https://github.com/git/git/blob/seen/Documentation/technical/chunk-format.txt) for details.
#![deny(unsafe_code)]
#![deny(rust_2018_idioms, missing_docs)]

/// An identifier to describe the kind of chunk, unique within a chunk file.
/// It is stored as the first four bytes of each table-of-contents entry and
/// decoded as a big-endian `u32` (see `file::decode`).
pub type Kind = u32;

/// A special value denoting the end of the chunk file table of contents.
/// A chunk may never use this value as its `Kind`.
pub const SENTINEL: Kind = 0;

///
pub mod file {
///
pub mod index {
    use std::ops::Range;

    use crate::file::Index;

    /// One entry of a chunk file's table of contents.
    pub struct Entry {
        /// The kind of the chunk.
        pub kind: crate::Kind,
        /// The offset, relative to the beginning of the file, at which to find the chunk and its end.
        pub offset: Range<crate::file::Offset>,
    }

    impl Index {
        /// The size of a single index entry in bytes: a 4-byte kind followed by an 8-byte offset.
        pub const ENTRY_SIZE: usize =
            std::mem::size_of::<crate::Kind>() + std::mem::size_of::<crate::file::Offset>();
        /// The smallest possible size of an index, consisting only of the sentinel value pointing past itself.
        pub const EMPTY_SIZE: usize = Index::ENTRY_SIZE;

        /// Find a chunk of `kind` and return its offset into the data if found.
        pub fn offset_by_kind(&self, kind: crate::Kind) -> Option<Range<crate::file::Offset>> {
            self.chunks
                .iter()
                .find(|entry| entry.kind == kind)
                .map(|entry| entry.offset.clone())
        }
    }
}

/// The offset to a chunk, in bytes, as seen relative to the beginning of the file containing it.
pub type Offset = u64;

/// A chunk file's parsed table of contents, providing access into the parent data by chunk kind.
pub struct Index {
/// Validated chunks as defined by their index entries, in table-of-contents order.
pub chunks: Vec<index::Entry>,
}
///
pub mod decode {
pub use error::Error;
use std::convert::TryInto;
use std::ops::Range;

mod error {
use quick_error::quick_error;
quick_error! {
/// The error returned by [`crate::file::Index::from_bytes()`].
#[derive(Debug)]
#[allow(missing_docs)]
pub enum Error {
EarlySentinelValue {
display("Sentinel value encountered while still processing chunks.")
}
MissingSentinelValue { actual: crate::Kind } {
display("Sentinel value wasn't found, saw {:#016x}", actual)
}
ChunkSizeOutOfBounds { offset: crate::file::Offset, file_length: u64 } {
display("The chunk offset {} went past the file of length {} - was it truncated?", offset, file_length)
}
DuplicateChunk(kind: crate::Kind) {
display("The chunk of kind {:#016x} was encountered more than once", kind)
}
TocTooSmall { actual: usize, expected: usize } {
display("The table of contents would be {} bytes, but got only {}", expected, actual)
}
Empty {
display("Empty chunk indices are not allowed as the point of chunked files is to have chunks.")
}
}
}
}
use crate::file;
use crate::file::index;

impl file::Index {
    /// Decode a table of contents from the mapped file `data`, starting at `toc_offset`,
    /// and return an index holding `num_chunks` validated chunks.
    ///
    /// The table of contents consists of `num_chunks + 1` fixed-size entries, the last
    /// of which is the sentinel whose offset marks the end of the final chunk.
    pub fn from_bytes(data: &[u8], toc_offset: usize, num_chunks: u32) -> Result<Self, Error> {
        if num_chunks == 0 {
            return Err(Error::Empty);
        }

        let file_len: u64 = data.len() as u64;
        let toc = &data[toc_offset..];
        // One extra entry is required for the trailing sentinel.
        let required = (num_chunks as usize + 1) * file::Index::ENTRY_SIZE;
        if toc.len() < required {
            return Err(Error::TocTooSmall {
                expected: required,
                actual: toc.len(),
            });
        }

        let mut chunks = Vec::with_capacity(num_chunks as usize);
        for entry_no in 0..num_chunks as usize {
            let entry = &toc[entry_no * file::Index::ENTRY_SIZE..];
            let kind = be_u32(entry);
            if kind == crate::SENTINEL {
                return Err(Error::EarlySentinelValue);
            }
            // Linear scan is fine here: chunk files contain only a handful of chunks.
            if chunks.iter().any(|c: &index::Entry| c.kind == kind) {
                return Err(Error::DuplicateChunk(kind));
            }

            let start = be_u64(&entry[4..]);
            if start > file_len {
                return Err(Error::ChunkSizeOutOfBounds {
                    offset: start,
                    file_length: file_len,
                });
            }
            // The end of this chunk is the offset stored in the following entry.
            let end = be_u64(&entry[file::Index::ENTRY_SIZE + 4..]);
            if end > file_len {
                return Err(Error::ChunkSizeOutOfBounds {
                    offset: end,
                    file_length: file_len,
                });
            }
            chunks.push(index::Entry {
                kind,
                offset: Range { start, end },
            })
        }

        // The entry right after the last chunk must carry the sentinel kind.
        let sentinel = be_u32(&toc[num_chunks as usize * file::Index::ENTRY_SIZE..]);
        if sentinel != crate::SENTINEL {
            return Err(Error::MissingSentinelValue { actual: sentinel });
        }

        Ok(file::Index { chunks })
    }
}

/// Interpret the first four bytes of `input` as a big-endian `u32`.
/// Panics if `input` holds fewer than four bytes.
fn be_u32(input: &[u8]) -> u32 {
    let bytes: [u8; 4] = input[..4].try_into().unwrap();
    u32::from_be_bytes(bytes)
}
/// Interpret the first eight bytes of `input` as a big-endian `u64`.
/// Panics if `input` holds fewer than eight bytes.
fn be_u64(input: &[u8]) -> u64 {
    let bytes: [u8; 8] = input[..8].try_into().unwrap();
    u64::from_be_bytes(bytes)
}
}
}
1 change: 1 addition & 0 deletions git-pack/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ all-features = true
[dependencies]
git-features = { version ="^0.18.0", path = "../git-features", features = ["crc32", "rustsha1", "progress", "zlib"] }
git-hash = { version ="^0.8.0", path = "../git-hash" }
git-chunk = { version ="^0.0.0", path = "../git-chunk" }
git-object = { version ="^0.16.0", path = "../git-object" }
git-traverse = { version ="^0.11.0", path = "../git-traverse" }
git-diff = { version ="^0.12.0", path = "../git-diff" }
Expand Down
19 changes: 14 additions & 5 deletions git-pack/src/multi_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ pub mod init {
UnsupportedVersion { version: u8 },
#[error("Unsupported hash kind: {kind})")]
UnsupportedHashKind { kind: u8 },
#[error(transparent)]
ChunkFileDecode(#[from] git_chunk::file::decode::Error),
}
}
pub use error::Error;
Expand All @@ -82,15 +84,20 @@ pub mod init {
path: path.to_owned(),
})?;

const HEADER_LEN: usize = (4 /*signature*/ + 1 /*version*/ + 1 /*object id version*/ + 1 /* num chunks */ + 1/* num base files */ + 4/*num pack files*/);
const HEADER_LEN: usize = 4 /*signature*/ +
1 /*version*/ +
1 /*object id version*/ +
1 /*num chunks */ +
1 /*num base files */ +
4 /*num pack files*/;
const TRAILER_LEN: usize = git_hash::Kind::longest().len_in_bytes(); /* trailing hash */
if data.len() < HEADER_LEN + TRAILER_LEN {
if data.len() < HEADER_LEN + git_chunk::file::Index::EMPTY_SIZE + TRAILER_LEN {
return Err(Error::Corrupt {
message: "multi-index file is truncated and too short".into(),
});
}

let (version, hash_kind, num_chunks, num_packs, toc) = {
let (version, hash_kind, num_chunks, num_packs) = {
let (signature, data) = data.split_at(4);
if signature != b"MIDX" {
return Err(Error::Corrupt {
Expand All @@ -114,12 +121,14 @@ pub mod init {

let (_num_base_files, data) = data.split_at(1); // TODO: handle base files once it's clear what this does

let (num_packs, toc) = data.split_at(4);
let (num_packs, _) = data.split_at(4);
let num_packs = BigEndian::read_u32(num_packs);

(version, hash_kind, num_chunks, num_packs, toc)
(version, hash_kind, num_chunks, num_packs)
};

let chunks = git_chunk::file::Index::from_bytes(&data, HEADER_LEN, num_chunks as u32)?;

Ok(File {
data,
path: path.to_owned(),
Expand Down

0 comments on commit fac8efa

Please sign in to comment.