Skip to content

Commit

Permalink
multi-pack index writing complete with large-offset support (#279)
Browse files Browse the repository at this point in the history
  • Loading branch information
Byron committed Dec 31, 2021
1 parent bfc8069 commit f7d5c7f
Show file tree
Hide file tree
Showing 8 changed files with 95 additions and 32 deletions.
6 changes: 3 additions & 3 deletions git-chunk/src/file/write.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
#![allow(missing_docs, unused)]
use crate::file::index::Entry;
use crate::file::Index;
use crate::file::{index::Entry, Index};

/// Internal state machine for the chunk-file writer.
// NOTE(review): variants presumably mark the two phases — planning chunks,
// then streaming them out — confirm against the state transitions in the
// surrounding writer (not fully visible here).
enum State {
    /// Chunk entries are still being planned/collected; nothing written yet.
    Collecting,
    /// All chunks are planned; the writer is emitting them to the output.
    WriteChunks,
}

mod write_chunk {
use crate::file::index;
use std::collections::VecDeque;

use crate::file::index;

pub struct Chunk<W> {
chunks_to_write: VecDeque<index::Entry>,
inner: W,
Expand Down
3 changes: 2 additions & 1 deletion git-hash/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@

mod borrowed;

pub use borrowed::oid;
use std::convert::TryFrom;

pub use borrowed::oid;

mod owned;
pub use owned::ObjectId;

Expand Down
3 changes: 1 addition & 2 deletions git-pack/src/index/traverse/indexed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,9 @@ use std::{
use git_features::{parallel, progress::Progress};

use super::{Error, SafetyCheck};
use crate::index::traverse::Outcome;
use crate::{
cache::delta::traverse::Context,
index::{self, util::index_entries_sorted_by_offset_ascending},
index::{self, traverse::Outcome, util::index_entries_sorted_by_offset_ascending},
};

/// Traversal with index
Expand Down
3 changes: 2 additions & 1 deletion git-pack/src/index/traverse/with_lookup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,11 @@ mod options {
}
use std::sync::atomic::Ordering;

use crate::index::traverse::Outcome;
use git_features::threading::{lock, Mutable, OwnShared};
pub use options::Options;

use crate::index::traverse::Outcome;

/// Verify and validate the content of the index file
impl index::File {
/// Iterate through all _decoded objects_ in the given `pack` and handle them with a `Processor` using a cache to reduce the amount of
Expand Down
53 changes: 45 additions & 8 deletions git-pack/src/multi_index/chunk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,12 @@ pub mod index_names {

/// Information for the chunk with the fanout table
pub mod fanout {
use crate::multi_index;
use byteorder::{BigEndian, WriteBytesExt};
use std::convert::TryInto;

use byteorder::{BigEndian, WriteBytesExt};

use crate::multi_index;

/// The size of the fanout table
pub const SIZE: usize = 4 * 256;

Expand Down Expand Up @@ -144,9 +146,10 @@ pub mod fanout {

/// Information about the oid lookup table.
pub mod lookup {
use crate::multi_index;
use std::ops::Range;

use crate::multi_index;

/// The id uniquely identifying the oid lookup table.
pub const ID: git_chunk::Id = *b"OIDL";

Expand All @@ -173,10 +176,12 @@ pub mod lookup {

/// Information about the offsets table.
pub mod offsets {
use crate::multi_index;
use byteorder::{BigEndian, WriteBytesExt};
use std::ops::Range;

use byteorder::{BigEndian, WriteBytesExt};

use crate::multi_index;

/// The id uniquely identifying the offsets table.
pub const ID: git_chunk::Id = *b"OOFF";

Expand All @@ -185,11 +190,10 @@ pub mod offsets {
(entries * (4 /*pack-id*/ + 4/* pack offset */)) as u64
}

/// Returns the amount of entries that need a u64 offset.
pub(crate) fn write(
sorted_entries: &[multi_index::write::Entry],
mut out: impl std::io::Write,
) -> std::io::Result<u32> {
) -> std::io::Result<()> {
use crate::index::write::encode::{HIGH_BIT, LARGE_OFFSET_THRESHOLD};
let mut num_large_offsets = 0u32;

Expand All @@ -205,7 +209,7 @@ pub mod offsets {
};
out.write_u32::<BigEndian>(offset)?;
}
Ok(num_large_offsets)
Ok(())
}

/// Returns true if the `offset` range seems to match the size required for `num_objects`.
Expand All @@ -219,11 +223,44 @@ pub mod offsets {
pub mod large_offsets {
use std::ops::Range;

use byteorder::{BigEndian, WriteBytesExt};

use crate::{index::write::encode::LARGE_OFFSET_THRESHOLD, multi_index};

/// The id uniquely identifying the large offsets table (with 64 bit offsets)
pub const ID: git_chunk::Id = *b"LOFF";

/// Count the entries whose pack offset exceeds [`LARGE_OFFSET_THRESHOLD`]
/// and therefore must be stored in the 64-bit large-offsets chunk.
pub(crate) fn num_large_offsets(entries: &[multi_index::write::Entry]) -> usize {
    let mut count = 0;
    for entry in entries {
        if entry.pack_offset > LARGE_OFFSET_THRESHOLD {
            count += 1;
        }
    }
    count
}
/// Returns true if the `offset` byte range looks plausible for this chunk:
/// every large offset occupies 8 bytes, so the range length must be a
/// multiple of 8.
pub fn is_valid(offset: &Range<usize>) -> bool {
    let byte_len = offset.end - offset.start;
    byte_len % 8 == 0
}

/// Write the 64-bit offsets of all entries above [`LARGE_OFFSET_THRESHOLD`]
/// to `out` in big-endian order, cross-checking against the previously
/// counted `num_large_offsets`.
///
/// Panics if the number of offsets actually written disagrees with
/// `num_large_offsets` — that would be a bug in the caller's planning phase.
pub(crate) fn write(
    sorted_entries: &[multi_index::write::Entry],
    mut num_large_offsets: usize,
    mut out: impl std::io::Write,
) -> std::io::Result<()> {
    for offset in sorted_entries
        .iter()
        .filter_map(|e| (e.pack_offset > LARGE_OFFSET_THRESHOLD).then(|| e.pack_offset))
    {
        out.write_u64::<BigEndian>(offset)?;
        // Bugfix: the original discarded the `checked_sub` result (and the
        // parameter wasn't `mut`), so the counter never decreased and the
        // final assertion fired whenever any large offset existed.
        num_large_offsets = num_large_offsets
            .checked_sub(1)
            .expect("BUG: wrote more offsets than previously counted");
    }
    assert_eq!(num_large_offsets, 0, "BUG: wrote fewer offsets than initially counted");
    Ok(())
}

/// Return the number of bytes needed to store `large_offsets` entries,
/// at 8 bytes (one u64) per large offset.
pub(crate) fn storage_size(large_offsets: usize) -> u64 {
    (large_offsets as u64) * 8
}
}
3 changes: 1 addition & 2 deletions git-pack/src/multi_index/verify.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::cmp::Ordering;
use std::sync::atomic::AtomicBool;
use std::{cmp::Ordering, sync::atomic::AtomicBool};

use git_features::progress::Progress;

Expand Down
30 changes: 21 additions & 9 deletions git-pack/src/multi_index/write.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
#![allow(missing_docs, unused)]

use crate::multi_index;
use std::{
convert::TryInto,
io::Write,
path::PathBuf,
sync::atomic::AtomicBool,
time::{Instant, SystemTime},
};

use byteorder::{BigEndian, WriteBytesExt};
use git_features::progress::Progress;
use std::convert::TryInto;
use std::io::Write;
use std::path::PathBuf;
use std::sync::atomic::AtomicBool;
use std::time::{Instant, SystemTime};

use crate::multi_index;

mod error {
/// The error returned by [multi_index::File::write_from_index_paths()][super::multi_index::File::write_from_index_paths()].
Expand Down Expand Up @@ -125,23 +129,31 @@ impl multi_index::File {
multi_index::chunk::offsets::storage_size(entries.len()),
);

let num_large_offsets = multi_index::chunk::large_offsets::num_large_offsets(&entries);
if num_large_offsets > 0 {
cf.plan_chunk(
multi_index::chunk::large_offsets::ID,
multi_index::chunk::large_offsets::storage_size(num_large_offsets),
);
}

let bytes_written = Self::write_header(
&mut out,
cf.num_chunks().try_into().expect("BUG: wrote more than 256 chunks"),
index_paths_sorted.len() as u32,
object_hash,
)?;
let mut chunk_write = cf.into_write(&mut out, bytes_written)?;
let mut num_large_offsets = None;
while let Some(chunk_to_write) = chunk_write.next_chunk() {
match chunk_to_write {
multi_index::chunk::index_names::ID => {
multi_index::chunk::index_names::write(&index_filenames_sorted, &mut chunk_write)?
}
multi_index::chunk::fanout::ID => multi_index::chunk::fanout::write(&entries, &mut chunk_write)?,
multi_index::chunk::lookup::ID => multi_index::chunk::lookup::write(&entries, &mut chunk_write)?,
multi_index::chunk::offsets::ID => {
num_large_offsets = multi_index::chunk::offsets::write(&entries, &mut chunk_write)?.into();
multi_index::chunk::offsets::ID => multi_index::chunk::offsets::write(&entries, &mut chunk_write)?,
multi_index::chunk::large_offsets::ID => {
multi_index::chunk::large_offsets::write(&entries, num_large_offsets, &mut chunk_write)?
}
unknown => unreachable!("BUG: forgot to implement chunk {:?}", std::str::from_utf8(&unknown)),
}
Expand Down
26 changes: 20 additions & 6 deletions git-pack/tests/pack/multi_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -112,10 +112,10 @@ mod verify {
}

mod write {
use std::{path::PathBuf, sync::atomic::AtomicBool};

use git_features::progress;
use git_testtools::{fixture_path, hex_to_id};
use std::path::PathBuf;
use std::sync::atomic::AtomicBool;

#[test]
fn from_paths() {
Expand All @@ -136,7 +136,7 @@ mod write {
.open(&output_path)
.unwrap();
let outcome = git_pack::multi_index::File::write_from_index_paths(
input_indices,
input_indices.clone(),
&mut out,
progress::Discard,
&AtomicBool::new(false),
Expand All @@ -148,14 +148,28 @@ mod write {

assert_eq!(
outcome.multi_index_checksum,
hex_to_id("dddddddddddddddddddddddddddddddddddddddd")
hex_to_id("d34d327039a3554f8a644b29e07b903fa71ef269")
);

let file = git_pack::multi_index::File::at(output_path).unwrap();
assert_eq!(file.num_indices(), 3);
assert_eq!(file.index_names(), vec![PathBuf::from("hello.idx")]);
assert_eq!(file.num_objects(), 42);
assert_eq!(
file.index_names(),
vec![
PathBuf::from("pack-11fdfa9e156ab73caae3b6da867192221f2089c2.idx"),
PathBuf::from("pack-a2bf8e71d8c18879e499335762dd95119d93d9f1.idx"),
PathBuf::from("pack-c0438c19fb16422b6bbcce24387b3264416d485b.idx"),
]
);
assert_eq!(file.num_objects(), 139);
assert_eq!(file.checksum(), outcome.multi_index_checksum);

for index in &input_indices {
std::fs::copy(index, dir.path().join(index.file_name().expect("present"))).unwrap();
let pack = index.with_extension("pack");
std::fs::copy(&pack, dir.path().join(pack.file_name().expect("present"))).unwrap();
}

assert_eq!(
file.verify_integrity(
|| git_pack::cache::Never,
Expand Down

0 comments on commit f7d5c7f

Please sign in to comment.