Skip to content

Commit

Permalink
Add a nonstandard shallow clone for GitHub
Browse files Browse the repository at this point in the history
  • Loading branch information
Eh2406 committed Oct 7, 2022
1 parent 0b84a35 commit 3570ee7
Show file tree
Hide file tree
Showing 4 changed files with 182 additions and 59 deletions.
5 changes: 2 additions & 3 deletions src/cargo/core/source/source_id.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@ use crate::sources::{DirectorySource, CRATES_IO_DOMAIN, CRATES_IO_INDEX, CRATES_
use crate::sources::{GitSource, PathSource, RegistrySource};
use crate::util::{CanonicalUrl, CargoResult, Config, IntoUrl};
use log::trace;
use serde::de;
use serde::ser;
use serde::{de, ser, Serialize};
use std::cmp::{self, Ordering};
use std::collections::HashSet;
use std::fmt::{self, Formatter};
Expand Down Expand Up @@ -58,7 +57,7 @@ enum SourceKind {
}

/// Information to find a specific commit in a Git repository.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)]
pub enum GitReference {
/// From a tag.
Tag(String),
Expand Down
2 changes: 1 addition & 1 deletion src/cargo/sources/git/source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ impl<'cfg> Source for GitSource<'cfg> {
// database, then try to resolve our reference with the preexisting
// repository.
(None, Some(db)) if self.config.offline() => {
let rev = db.resolve(&self.manifest_reference).with_context(|| {
let rev = db.resolve_to_object(&self.manifest_reference).with_context(|| {
"failed to lookup reference in preexisting repository, and \
can't check for updates in offline mode (--offline)"
})?;
Expand Down
214 changes: 174 additions & 40 deletions src/cargo/sources/git/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,13 @@ use cargo_util::{paths, ProcessBuilder};
use curl::easy::List;
use git2::{self, ErrorClass, ObjectType, Oid};
use log::{debug, info};
use serde::ser;
use serde::Serialize;
use serde::{ser, Deserialize, Serialize};
use std::borrow::Cow;
use std::env;
use std::fmt;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::str;
use std::str::{self, FromStr};
use std::time::{Duration, Instant};
use url::Url;

Expand All @@ -28,6 +27,17 @@ where
s.collect_str(t)
}

fn deserialize_str<'de, D, T>(deserializer: D) -> Result<T, D::Error>
where
T: FromStr,
<T as FromStr>::Err: fmt::Display,
D: serde::Deserializer<'de>,
{
let buf = String::deserialize(deserializer)?;

FromStr::from_str(&buf).map_err(serde::de::Error::custom)
}

pub struct GitShortID(git2::Buf);

impl GitShortID {
Expand Down Expand Up @@ -78,8 +88,25 @@ impl GitRemote {
&self.url
}

pub fn rev_for(&self, path: &Path, reference: &GitReference) -> CargoResult<git2::Oid> {
reference.resolve(&self.db_at(path)?.repo)
/// Finds the Oid associated with the reference. The result is guaranteed to be on disk.
/// But may not be the object the reference points to!
/// For example, the reference points to a Commit and this may return the Tree that commit points do.
pub fn rev_to_object_for(
&self,
path: &Path,
reference: &GitReference,
) -> CargoResult<git2::Oid> {
reference.resolve_to_object(&self.db_at(path)?.repo)
}

/// Finds the Oid of the Commit the reference points to. But the result may not be on disk!
/// For example, the reference points to a Commit and we have only cloned the Tree.
pub fn rev_to_commit_for(
&self,
path: &Path,
reference: &GitReference,
) -> CargoResult<git2::Oid> {
reference.resolve_to_commit(&self.db_at(path)?.repo)
}

pub fn checkout(
Expand All @@ -104,7 +131,7 @@ impl GitRemote {
}
}
None => {
if let Ok(rev) = reference.resolve(&db.repo) {
if let Ok(rev) = reference.resolve_to_object(&db.repo) {
return Ok((db, rev));
}
}
Expand All @@ -123,7 +150,7 @@ impl GitRemote {
.context(format!("failed to clone into: {}", into.display()))?;
let rev = match locked_rev {
Some(rev) => rev,
None => reference.resolve(&repo)?,
None => reference.resolve_to_object(&repo)?,
};

Ok((
Expand Down Expand Up @@ -179,13 +206,65 @@ impl GitDatabase {
self.repo.revparse_single(&oid.to_string()).is_ok()
}

pub fn resolve(&self, r: &GitReference) -> CargoResult<git2::Oid> {
r.resolve(&self.repo)
pub fn resolve_to_object(&self, r: &GitReference) -> CargoResult<git2::Oid> {
r.resolve_to_object(&self.repo)
}
}

#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
struct ShallowDataBlob {
#[serde(serialize_with = "serialize_str")]
#[serde(deserialize_with = "deserialize_str")]
tree: git2::Oid,
#[serde(serialize_with = "serialize_str")]
#[serde(deserialize_with = "deserialize_str")]
etag: git2::Oid,
}

#[test]
fn check_with_git_hub() {
panic!(
r#"nonstandard shallow clone may be worse than a full check out.
This test is here to make sure we do not merge until we have official signoff from GitHub"#
)
}

impl GitReference {
pub fn resolve(&self, repo: &git2::Repository) -> CargoResult<git2::Oid> {
/// Finds the Oid associated with the reference. The result is guaranteed to be on disk.
/// But may not be the object the reference points to!
/// For example, the reference points to a Commit and this may return the Tree that commit points do.
pub fn resolve_to_object(&self, repo: &git2::Repository) -> CargoResult<git2::Oid> {
// Check if Cargo has done a nonstandard shallow clone
if let Some(shallow_data) = self.find_shallow_blob(repo) {
Ok(shallow_data.tree)
} else {
self.resolve_by_git(repo)
}
}
/// Finds the Oid of the Commit the reference points to. But the result may not be on disk!
/// For example, the reference points to a Commit and we have only cloned the Tree.
pub fn resolve_to_commit(&self, repo: &git2::Repository) -> CargoResult<git2::Oid> {
// Check if Cargo has done a nonstandard shallow clone
if let Some(shallow_data) = self.find_shallow_blob(repo) {
return Ok(shallow_data.etag);
} else {
self.resolve_by_git(repo)
}
}

fn find_shallow_blob(&self, repo: &git2::Repository) -> Option<ShallowDataBlob> {
repo.find_reference(
&(format!(
"refs/cargo-{}",
serde_json::to_string(self).expect("why cant we make json of this")
)),
)
.ok()
.and_then(|re| re.peel_to_blob().ok())
.and_then(|blob| serde_json::from_slice(blob.content()).ok())
}

fn resolve_by_git(&self, repo: &git2::Repository) -> CargoResult<git2::Oid> {
let id = match self {
// Note that we resolve the named tag here in sync with where it's
// fetched into via `fetch` below.
Expand Down Expand Up @@ -707,10 +786,17 @@ fn reset(repo: &git2::Repository, obj: &git2::Object<'_>, config: &Config) -> Ca
opts.progress(|_, cur, max| {
drop(pb.tick(cur, max, ""));
});
debug!("doing reset");
repo.reset(obj, git2::ResetType::Hard, Some(&mut opts))?;
debug!("reset done");
Ok(())
if obj.as_tree().is_some() {
debug!("doing reset for Cargo nonstandard shallow clone");
repo.checkout_tree(obj, Some(&mut opts))?;
debug!("reset done");
Ok(())
} else {
debug!("doing reset");
repo.reset(obj, git2::ResetType::Hard, Some(&mut opts))?;
debug!("reset done");
Ok(())
}
}

pub fn with_fetch_options(
Expand Down Expand Up @@ -819,32 +905,44 @@ pub fn fetch(
// The `+` symbol on the refspec means to allow a forced (fast-forward)
// update which is needed if there is ever a force push that requires a
// fast-forward.
match reference {
// For branches and tags we can fetch simply one reference and copy it
// locally, no need to fetch other branches/tags.
GitReference::Branch(b) => {
refspecs.push(format!("+refs/heads/{0}:refs/remotes/origin/{0}", b));
}
GitReference::Tag(t) => {
refspecs.push(format!("+refs/tags/{0}:refs/remotes/origin/tags/{0}", t));
if let Some(oid_to_fetch) = oid_to_fetch {
// GitHub told us exactly the min needed to fetch. So we can go ahead and do a Cargo nonstandard shallow clone.
refspecs.push(format!("+{0}", oid_to_fetch));
} else {
// In some cases we have Cargo nonstandard shallow cloned this repo before, but cannot do it now.
// Mostly if GitHub is now rate limiting us. If so, remove the info about the shallow clone.
if let Ok(mut refe) = repo.find_reference(&format!(
"refs/cargo-{}",
serde_json::to_string(reference).expect("why cant we make json of this")
)) {
let _ = refe.delete();
}

GitReference::DefaultBranch => {
refspecs.push(String::from("+HEAD:refs/remotes/origin/HEAD"));
}
match reference {
// For branches and tags we can fetch simply one reference and copy it
// locally, no need to fetch other branches/tags.
GitReference::Branch(b) => {
refspecs.push(format!("+refs/heads/{0}:refs/remotes/origin/{0}", b));
}
GitReference::Tag(t) => {
refspecs.push(format!("+refs/tags/{0}:refs/remotes/origin/tags/{0}", t));
}

GitReference::Rev(rev) => {
if rev.starts_with("refs/") {
refspecs.push(format!("+{0}:{0}", rev));
} else if let Some(oid_to_fetch) = oid_to_fetch {
refspecs.push(format!("+{0}:refs/commit/{0}", oid_to_fetch));
} else {
// We don't know what the rev will point to. To handle this
// situation we fetch all branches and tags, and then we pray
// it's somewhere in there.
refspecs.push(String::from("+refs/heads/*:refs/remotes/origin/*"));
GitReference::DefaultBranch => {
refspecs.push(String::from("+HEAD:refs/remotes/origin/HEAD"));
tags = true;
}

GitReference::Rev(rev) => {
if rev.starts_with("refs/") {
refspecs.push(format!("+{0}:{0}", rev));
} else {
// We don't know what the rev will point to. To handle this
// situation we fetch all branches and tags, and then we pray
// it's somewhere in there.
refspecs.push(String::from("+refs/heads/*:refs/remotes/origin/*"));
refspecs.push(String::from("+HEAD:refs/remotes/origin/HEAD"));
tags = true;
}
}
}
}
Expand Down Expand Up @@ -1071,7 +1169,7 @@ fn github_fast_path(
return Ok(FastPathRev::Indeterminate);
}

let local_object = reference.resolve(repo).ok();
let local_object = reference.resolve_to_commit(repo).ok();

let github_branch_name = match reference {
GitReference::Branch(branch) => branch,
Expand Down Expand Up @@ -1141,7 +1239,7 @@ fn github_fast_path(
handle.useragent("cargo")?;
handle.http_headers({
let mut headers = List::new();
headers.append("Accept: application/vnd.github.3.sha")?;
headers.append("Accept: application/vnd.github+json")?;
if let Some(local_object) = local_object {
headers.append(&format!("If-None-Match: \"{}\"", local_object))?;
}
Expand All @@ -1161,8 +1259,44 @@ fn github_fast_path(
if response_code == 304 {
Ok(FastPathRev::UpToDate)
} else if response_code == 200 {
let oid_to_fetch = str::from_utf8(&response_body)?.parse::<Oid>()?;
Ok(FastPathRev::NeedsFetch(oid_to_fetch))
#[derive(Debug, Deserialize)]
struct GithubFastPathJsonResponse {
#[serde(serialize_with = "serialize_str")]
#[serde(deserialize_with = "deserialize_str")]
sha: git2::Oid,
commit: GithubCommitJsonResponse,
}

#[derive(Debug, Deserialize)]
struct GithubCommitJsonResponse {
tree: GithubTreeJsonResponse,
}

#[derive(Debug, Deserialize)]
struct GithubTreeJsonResponse {
#[serde(serialize_with = "serialize_str")]
#[serde(deserialize_with = "deserialize_str")]
sha: git2::Oid,
}

let data: GithubFastPathJsonResponse = serde_json::from_slice(&response_body)?;
// We can do a Cargo nonstandard shallow clone, so record the relevant information.
let bytes = serde_json::to_string(&ShallowDataBlob {
tree: data.commit.tree.sha,
etag: data.sha,
})
.expect("why cant we make json of this");
let shallow_blob = repo.blob(bytes.as_bytes())?;
repo.reference(
&format!(
"refs/cargo-{}",
serde_json::to_string(reference).expect("why cant we make json of this")
),
shallow_blob,
true,
"",
)?;
Ok(FastPathRev::NeedsFetch(data.commit.tree.sha))
} else {
// Usually response_code == 404 if the repository does not exist, and
// response_code == 422 if exists but GitHub is unable to resolve the
Expand Down
20 changes: 5 additions & 15 deletions src/cargo/sources/registry/remote.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ pub struct RemoteRegistry<'cfg> {
config: &'cfg Config,
tree: RefCell<Option<git2::Tree<'static>>>,
repo: LazyCell<git2::Repository>,
head: Cell<Option<git2::Oid>>,
current_sha: Cell<Option<InternedString>>,
needs_update: bool, // Does this registry need to be updated?
updated: bool, // Has this registry been updated this session?
Expand All @@ -46,7 +45,6 @@ impl<'cfg> RemoteRegistry<'cfg> {
index_git_ref: GitReference::DefaultBranch,
tree: RefCell::new(None),
repo: LazyCell::new(),
head: Cell::new(None),
current_sha: Cell::new(None),
needs_update: false,
updated: false,
Expand Down Expand Up @@ -96,15 +94,6 @@ impl<'cfg> RemoteRegistry<'cfg> {
})
}

fn head(&self) -> CargoResult<git2::Oid> {
if self.head.get().is_none() {
let repo = self.repo()?;
let oid = self.index_git_ref.resolve(repo)?;
self.head.set(Some(oid));
}
Ok(self.head.get().unwrap())
}

fn tree(&self) -> CargoResult<Ref<'_, git2::Tree<'_>>> {
{
let tree = self.tree.borrow();
Expand All @@ -113,8 +102,8 @@ impl<'cfg> RemoteRegistry<'cfg> {
}
}
let repo = self.repo()?;
let commit = repo.find_commit(self.head()?)?;
let tree = commit.tree()?;
let oid = self.index_git_ref.resolve_to_object(repo)?;
let tree = repo.find_object(oid, None)?.peel_to_tree()?;

// Unfortunately in libgit2 the tree objects look like they've got a
// reference to the repository object which means that a tree cannot
Expand All @@ -137,7 +126,9 @@ impl<'cfg> RemoteRegistry<'cfg> {
if let Some(sha) = self.current_sha.get() {
return Some(sha);
}
let sha = InternedString::new(&self.head().ok()?.to_string());
let repo = self.repo().ok()?;
let oid = self.index_git_ref.resolve_to_commit(repo).ok()?;
let sha = InternedString::new(&oid.to_string());
self.current_sha.set(Some(sha));
Some(sha)
}
Expand Down Expand Up @@ -277,7 +268,6 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
self.config.http()?;

self.prepare()?;
self.head.set(None);
*self.tree.borrow_mut() = None;
self.current_sha.set(None);
let path = self.config.assert_package_cache_locked(&self.index_path);
Expand Down

0 comments on commit 3570ee7

Please sign in to comment.