From 15cc376b90c1dcc54cc4959f2e621cb8fa6de6f6 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Wed, 10 May 2017 15:00:33 -0700 Subject: [PATCH] Don't check out the crates.io index locally This commit moves working with the crates.io index to operating on the git object layers rather than actually literally checking out the index. This is aimed at two different goals: * Improving the on-disk file size of the registry * Improving cloning times for the registry as the index doesn't need to be checked out The on disk size of my `registry` folder of a fresh check out of the index went form 124M to 48M, saving a good chunk of space! The entire operation took about 0.6s less on a Unix machine (out of 4.7s total for current Cargo). On Windows, however, the clone operation went from 11s to 6.7s, a much larger improvement! Closes #4015 --- src/cargo/sources/registry/index.rs | 85 ++++++++------ src/cargo/sources/registry/local.rs | 9 +- src/cargo/sources/registry/mod.rs | 13 ++- src/cargo/sources/registry/remote.rs | 161 +++++++++++++++++++-------- 4 files changed, 177 insertions(+), 91 deletions(-) diff --git a/src/cargo/sources/registry/index.rs b/src/cargo/sources/registry/index.rs index bbd7e226ca7..026f20204a9 100644 --- a/src/cargo/sources/registry/index.rs +++ b/src/cargo/sources/registry/index.rs @@ -1,14 +1,15 @@ use std::collections::HashMap; -use std::io::prelude::*; -use std::fs::File; use std::path::Path; +use std::str; use serde_json; use core::dependency::{Dependency, DependencyInner, Kind}; use core::{SourceId, Summary, PackageId, Registry}; use sources::registry::{RegistryPackage, RegistryDependency, INDEX_LOCK}; +use sources::registry::RegistryData; use util::{CargoResult, ChainError, internal, Filesystem, Config}; +use util::human; pub struct RegistryIndex<'cfg> { source_id: SourceId, @@ -23,7 +24,8 @@ impl<'cfg> RegistryIndex<'cfg> { pub fn new(id: &SourceId, path: &Filesystem, config: &'cfg Config, - locked: bool) -> RegistryIndex<'cfg> { + locked: bool) + -> RegistryIndex<'cfg> { RegistryIndex { source_id: id.clone(), path: path.clone(), @@ -35,13 +37,16 @@ impl<'cfg> RegistryIndex<'cfg> { } /// Return the hash listed for a specified PackageId. - pub fn hash(&mut self, pkg: &PackageId) -> CargoResult { + pub fn hash(&mut self, + pkg: &PackageId, + load: &mut RegistryData) + -> CargoResult { let key = (pkg.name().to_string(), pkg.version().to_string()); if let Some(s) = self.hashes.get(&key) { return Ok(s.clone()) } // Ok, we're missing the key, so parse the index file to load it. - self.summaries(pkg.name())?; + self.summaries(pkg.name(), load)?; self.hashes.get(&key).chain_error(|| { internal(format!("no hash listed for {}", pkg)) }).map(|s| s.clone()) @@ -51,11 +56,14 @@ impl<'cfg> RegistryIndex<'cfg> { /// /// Returns a list of pairs of (summary, yanked) for the package name /// specified. - pub fn summaries(&mut self, name: &str) -> CargoResult<&Vec<(Summary, bool)>> { + pub fn summaries(&mut self, + name: &str, + load: &mut RegistryData) + -> CargoResult<&Vec<(Summary, bool)>> { if self.cache.contains_key(name) { return Ok(&self.cache[name]); } - let summaries = self.load_summaries(name)?; + let summaries = self.load_summaries(name, load)?; let summaries = summaries.into_iter().filter(|summary| { summary.0.package_id().name() == name }).collect(); @@ -63,8 +71,11 @@ impl<'cfg> RegistryIndex<'cfg> { Ok(&self.cache[name]) } - fn load_summaries(&mut self, name: &str) -> CargoResult> { - let (path, _lock) = if self.locked { + fn load_summaries(&mut self, + name: &str, + load: &mut RegistryData) + -> CargoResult> { + let (root, _lock) = if self.locked { let lock = self.path.open_ro(Path::new(INDEX_LOCK), self.config, "the registry index"); @@ -84,25 +95,32 @@ impl<'cfg> RegistryIndex<'cfg> { // see module comment for why this is structured the way it is let path = match fs_name.len() { - 1 => path.join("1").join(&fs_name), - 2 => path.join("2").join(&fs_name), - 3 => path.join("3").join(&fs_name[..1]).join(&fs_name), - _ => path.join(&fs_name[0..2]) - .join(&fs_name[2..4]) - .join(&fs_name), + 1 => format!("1/{}", fs_name), + 2 => format!("2/{}", fs_name), + 3 => format!("3/{}/{}", &fs_name[..1], fs_name), + _ => format!("{}/{}/{}", &fs_name[0..2], &fs_name[2..4], fs_name), + // 1 => Path::new("1").join(fs_name), + // 2 => Path::new("2").join(fs_name), + // 3 => Path::new("3").join(&fs_name[..1]).join(fs_name), + // _ => Path::new(&fs_name[0..2]).join(&fs_name[2..4]).join(fs_name), }; - match File::open(&path) { - Ok(mut f) => { - let mut contents = String::new(); - f.read_to_string(&mut contents)?; - let ret: CargoResult>; - ret = contents.lines().filter(|l| !l.trim().is_empty()) - .map(|l| self.parse_registry_package(l)) - .collect(); - ret.chain_error(|| { - internal(format!("failed to parse registry's information \ - for: {}", name)) - }) + match load.load(&root, Path::new(&path)) { + Ok(contents) => { + let contents = str::from_utf8(&contents).map_err(|_| { + human("registry index file was not valid utf-8") + })?; + let lines = contents.lines() + .map(|s| s.trim()) + .filter(|l| !l.is_empty()); + + // Attempt forwards-compatibility on the index by ignoring + // everything that we ourselves don't understand, that should + // allow future cargo implementations to break the + // interpretation of each line here and older cargo will simply + // ignore the new lines. + Ok(lines.filter_map(|line| { + self.parse_registry_package(line).ok() + }).collect()) } Err(..) => Ok(Vec::new()), } @@ -161,12 +179,13 @@ impl<'cfg> RegistryIndex<'cfg> { .set_kind(kind) .into_dependency()) } -} -impl<'cfg> Registry for RegistryIndex<'cfg> { - fn query(&mut self, dep: &Dependency) -> CargoResult> { + pub fn query(&mut self, + dep: &Dependency, + load: &mut RegistryData) + -> CargoResult> { let mut summaries = { - let summaries = self.summaries(dep.name())?; + let summaries = self.summaries(dep.name(), load)?; summaries.iter().filter(|&&(_, yanked)| { dep.source_id().precise().is_some() || !yanked }).map(|s| s.0.clone()).collect::>() @@ -188,8 +207,4 @@ impl<'cfg> Registry for RegistryIndex<'cfg> { }); summaries.query(dep) } - - fn supports_checksums(&self) -> bool { - true - } } diff --git a/src/cargo/sources/registry/local.rs b/src/cargo/sources/registry/local.rs index 67e88a9c72d..fa4cbcd82d1 100644 --- a/src/cargo/sources/registry/local.rs +++ b/src/cargo/sources/registry/local.rs @@ -6,8 +6,9 @@ use rustc_serialize::hex::ToHex; use core::PackageId; use sources::registry::{RegistryData, RegistryConfig}; -use util::{Config, CargoResult, ChainError, human, Sha256, Filesystem}; use util::FileLock; +use util::paths; +use util::{Config, CargoResult, ChainError, human, Sha256, Filesystem}; pub struct LocalRegistry<'cfg> { index_path: Filesystem, @@ -34,7 +35,11 @@ impl<'cfg> RegistryData for LocalRegistry<'cfg> { &self.index_path } - fn config(&self) -> CargoResult> { + fn load(&self, root: &Path, path: &Path) -> CargoResult> { + paths::read_bytes(&root.join(path)) + } + + fn config(&mut self) -> CargoResult> { // Local registries don't have configuration for remote APIs or anything // like that Ok(None) diff --git a/src/cargo/sources/registry/mod.rs b/src/cargo/sources/registry/mod.rs index 2044c30ebfe..2af4ad9719f 100644 --- a/src/cargo/sources/registry/mod.rs +++ b/src/cargo/sources/registry/mod.rs @@ -219,7 +219,8 @@ struct RegistryDependency { pub trait RegistryData { fn index_path(&self) -> &Filesystem; - fn config(&self) -> CargoResult>; + fn load(&self, root: &Path, path: &Path) -> CargoResult>; + fn config(&mut self) -> CargoResult>; fn update_index(&mut self) -> CargoResult<()>; fn download(&mut self, pkg: &PackageId, @@ -274,7 +275,7 @@ impl<'cfg> RegistrySource<'cfg> { /// Decode the configuration stored within the registry. /// /// This requires that the index has been at least checked out. - pub fn config(&self) -> CargoResult> { + pub fn config(&mut self) -> CargoResult> { self.ops.config() } @@ -323,12 +324,12 @@ impl<'cfg> Registry for RegistrySource<'cfg> { // come back with no summaries, then our registry may need to be // updated, so we fall back to performing a lazy update. if dep.source_id().precise().is_some() && !self.updated { - if self.index.query(dep)?.is_empty() { + if self.index.query(dep, &mut *self.ops)?.is_empty() { self.do_update()?; } } - self.index.query(dep) + self.index.query(dep, &mut *self.ops) } fn supports_checksums(&self) -> bool { @@ -356,7 +357,7 @@ impl<'cfg> Source for RegistrySource<'cfg> { } fn download(&mut self, package: &PackageId) -> CargoResult { - let hash = self.index.hash(package)?; + let hash = self.index.hash(package, &mut *self.ops)?; let path = self.ops.download(package, &hash)?; let path = self.unpack_package(package, &path).chain_error(|| { internal(format!("failed to unpack package `{}`", package)) @@ -369,7 +370,7 @@ impl<'cfg> Source for RegistrySource<'cfg> { // differ due to historical Cargo bugs. To paper over these we trash the // *summary* loaded from the Cargo.toml we just downloaded with the one // we loaded from the index. - let summaries = self.index.summaries(package.name())?; + let summaries = self.index.summaries(package.name(), &mut *self.ops)?; let summary = summaries.iter().map(|s| &s.0).find(|s| { s.package_id() == package }).expect("summary not found"); diff --git a/src/cargo/sources/registry/remote.rs b/src/cargo/sources/registry/remote.rs index 7408eb45060..78c9cb5b5fa 100644 --- a/src/cargo/sources/registry/remote.rs +++ b/src/cargo/sources/registry/remote.rs @@ -1,5 +1,7 @@ +use std::cell::{RefCell, Ref, Cell}; use std::io::SeekFrom; use std::io::prelude::*; +use std::mem; use std::path::Path; use curl::easy::{Easy, List}; @@ -13,8 +15,7 @@ use ops; use sources::git; use sources::registry::{RegistryData, RegistryConfig, INDEX_LOCK}; use util::network; -use util::paths; -use util::{FileLock, Filesystem}; +use util::{FileLock, Filesystem, LazyCell}; use util::{Config, CargoResult, ChainError, human, Sha256, ToUrl}; use util::errors::HttpError; @@ -23,7 +24,10 @@ pub struct RemoteRegistry<'cfg> { cache_path: Filesystem, source_id: SourceId, config: &'cfg Config, - handle: Option, + handle: LazyCell>, + tree: RefCell>>, + repo: LazyCell, + head: Cell>, } impl<'cfg> RemoteRegistry<'cfg> { @@ -34,9 +38,74 @@ impl<'cfg> RemoteRegistry<'cfg> { cache_path: config.registry_cache_path().join(name), source_id: source_id.clone(), config: config, - handle: None, + tree: RefCell::new(None), + handle: LazyCell::new(), + repo: LazyCell::new(), + head: Cell::new(None), } } + + fn easy(&self) -> CargoResult<&RefCell> { + self.handle.get_or_try_init(|| { + ops::http_handle(self.config).map(RefCell::new) + }) + } + + fn repo(&self) -> CargoResult<&git2::Repository> { + self.repo.get_or_try_init(|| { + let path = self.index_path.clone().into_path_unlocked(); + + match git2::Repository::open(&path) { + Ok(repo) => Ok(repo), + Err(_) => { + self.index_path.create_dir()?; + let lock = self.index_path.open_rw(Path::new(INDEX_LOCK), + self.config, + "the registry index")?; + let _ = lock.remove_siblings(); + Ok(git2::Repository::init_bare(&path)?) + } + } + }) + } + + fn head(&self) -> CargoResult { + if self.head.get().is_none() { + let oid = self.repo()?.refname_to_id("refs/remotes/origin/master")?; + self.head.set(Some(oid)); + } + Ok(self.head.get().unwrap()) + } + + fn tree(&self) -> CargoResult> { + { + let tree = self.tree.borrow(); + if tree.is_some() { + return Ok(Ref::map(tree, |s| s.as_ref().unwrap())) + } + } + let repo = self.repo()?; + let commit = repo.find_commit(self.head()?)?; + let tree = commit.tree()?; + + // Unfortunately in libgit2 the tree objects look like they've got a + // reference to the repository object which means that a tree cannot + // outlive the repository that it came from. Here we want to cache this + // tree, though, so to accomplish this we transmute it to a static + // lifetime. + // + // Note that we don't actually hand out the static lifetime, instead we + // only return a scoped one from this function. Additionally the repo + // we loaded from (above) lives as long as this object + // (`RemoteRegistry`) so we then just need to ensure that the tree is + // destroyed first in the destructor, hence the destructor on + // `RemoteRegistry` below. + let tree = unsafe { + mem::transmute::>(tree) + }; + *self.tree.borrow_mut() = Some(tree); + Ok(Ref::map(self.tree.borrow(), |s| s.as_ref().unwrap())) + } } impl<'cfg> RegistryData for RemoteRegistry<'cfg> { @@ -44,13 +113,28 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> { &self.index_path } - fn config(&self) -> CargoResult> { - let lock = self.index_path.open_ro(Path::new(INDEX_LOCK), - self.config, - "the registry index")?; - let path = lock.path().parent().unwrap(); - let contents = paths::read(&path.join("config.json"))?; - let config = serde_json::from_str(&contents)?; + fn load(&self, _root: &Path, path: &Path) -> CargoResult> { + // Note that the index calls this method and the filesystem is locked + // in the index, so we don't need to worry about an `update_index` + // happening in a different process. + let repo = self.repo()?; + let tree = self.tree()?; + let entry = tree.get_path(path)?; + let object = entry.to_object(&repo)?; + let blob = match object.as_blob() { + Some(blob) => blob, + None => bail!("path `{}` is not a blob in the git repo", path.display()), + }; + Ok(blob.content().to_vec()) + } + + fn config(&mut self) -> CargoResult> { + self.repo()?; // create intermediate dirs and initialize the repo + let _lock = self.index_path.open_ro(Path::new(INDEX_LOCK), + self.config, + "the registry index")?; + let json = self.load(Path::new(""), Path::new("config.json"))?; + let config = serde_json::from_slice(&json)?; Ok(Some(config)) } @@ -63,53 +147,33 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> { // hit the index, which may not actually read this configuration. ops::http_handle(self.config)?; - // Then we actually update the index - self.index_path.create_dir()?; - let lock = self.index_path.open_rw(Path::new(INDEX_LOCK), - self.config, - "the registry index")?; - let path = lock.path().parent().unwrap(); - + let repo = self.repo()?; + let _lock = self.index_path.open_rw(Path::new(INDEX_LOCK), + self.config, + "the registry index")?; self.config.shell().status("Updating", format!("registry `{}`", self.source_id.url()))?; - let repo = git2::Repository::open(path).or_else(|_| { - let _ = lock.remove_siblings(); - git2::Repository::init(path) - })?; - if self.source_id.url().host_str() == Some("github.com") { - if let Ok(oid) = repo.refname_to_id("refs/heads/master") { - let handle = match self.handle { - Some(ref mut handle) => handle, - None => { - self.handle = Some(ops::http_handle(self.config)?); - self.handle.as_mut().unwrap() - } - }; + if let Ok(oid) = self.head() { + let mut handle = self.easy()?.borrow_mut(); debug!("attempting github fast path for {}", self.source_id.url()); - if github_up_to_date(handle, self.source_id.url(), &oid) { + if github_up_to_date(&mut handle, self.source_id.url(), &oid) { return Ok(()) } debug!("fast path failed, falling back to a git fetch"); } } - // git fetch origin + // git fetch origin master let url = self.source_id.url().to_string(); let refspec = "refs/heads/master:refs/remotes/origin/master"; - git::fetch(&repo, &url, refspec, self.config).chain_error(|| { human(format!("failed to fetch `{}`", url)) })?; - - // git reset --hard origin/master - let reference = "refs/remotes/origin/master"; - let oid = repo.refname_to_id(reference)?; - trace!("[{}] updating to rev {}", self.source_id, oid); - let object = repo.find_object(oid, None)?; - repo.reset(&object, git2::ResetType::Hard, None)?; + self.head.set(None); + *self.tree.borrow_mut() = None; Ok(()) } @@ -144,17 +208,11 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> { .push(&pkg.version().to_string()) .push("download"); - let handle = match self.handle { - Some(ref mut handle) => handle, - None => { - self.handle = Some(ops::http_handle(self.config)?); - self.handle.as_mut().unwrap() - } - }; // TODO: don't download into memory, but ensure that if we ctrl-c a // download we should resume either from the start or the middle // on the next time let url = url.to_string(); + let mut handle = self.easy()?.borrow_mut(); handle.get(true)?; handle.url(&url)?; handle.follow_location(true)?; @@ -192,6 +250,13 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> { } } +impl<'cfg> Drop for RemoteRegistry<'cfg> { + fn drop(&mut self) { + // Just be sure to drop this before our other fields + self.tree.borrow_mut().take(); + } +} + /// Updating the index is done pretty regularly so we want it to be as fast as /// possible. For registries hosted on github (like the crates.io index) there's /// a fast path available to use [1] to tell us that there's no updates to be