Skip to content

Commit

Permalink
Don't check out the crates.io index locally
Browse files Browse the repository at this point in the history
This commit changes how Cargo works with the crates.io index: it now operates
directly on the git object layers rather than literally checking out the index.
This is aimed at two different goals:

* Improving the on-disk file size of the registry
* Improving cloning times for the registry as the index doesn't need to be
  checked out

The on-disk size of my `registry` folder after a fresh checkout of the index went
from 124M to 48M, saving a good chunk of space! The entire operation took about
0.6s less on a Unix machine (out of 4.7s total for current Cargo). On Windows,
however, the clone operation went from 11s to 6.7s, a much larger improvement!

Closes #4015
  • Loading branch information
alexcrichton committed May 11, 2017
1 parent c00e56d commit 15cc376
Show file tree
Hide file tree
Showing 4 changed files with 177 additions and 91 deletions.
85 changes: 50 additions & 35 deletions src/cargo/sources/registry/index.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
use std::collections::HashMap;
use std::io::prelude::*;
use std::fs::File;
use std::path::Path;
use std::str;

use serde_json;

use core::dependency::{Dependency, DependencyInner, Kind};
use core::{SourceId, Summary, PackageId, Registry};
use sources::registry::{RegistryPackage, RegistryDependency, INDEX_LOCK};
use sources::registry::RegistryData;
use util::{CargoResult, ChainError, internal, Filesystem, Config};
use util::human;

pub struct RegistryIndex<'cfg> {
source_id: SourceId,
Expand All @@ -23,7 +24,8 @@ impl<'cfg> RegistryIndex<'cfg> {
pub fn new(id: &SourceId,
path: &Filesystem,
config: &'cfg Config,
locked: bool) -> RegistryIndex<'cfg> {
locked: bool)
-> RegistryIndex<'cfg> {
RegistryIndex {
source_id: id.clone(),
path: path.clone(),
Expand All @@ -35,13 +37,16 @@ impl<'cfg> RegistryIndex<'cfg> {
}

/// Return the hash listed for a specified PackageId.
pub fn hash(&mut self, pkg: &PackageId) -> CargoResult<String> {
pub fn hash(&mut self,
pkg: &PackageId,
load: &mut RegistryData)
-> CargoResult<String> {
let key = (pkg.name().to_string(), pkg.version().to_string());
if let Some(s) = self.hashes.get(&key) {
return Ok(s.clone())
}
// Ok, we're missing the key, so parse the index file to load it.
self.summaries(pkg.name())?;
self.summaries(pkg.name(), load)?;
self.hashes.get(&key).chain_error(|| {
internal(format!("no hash listed for {}", pkg))
}).map(|s| s.clone())
Expand All @@ -51,20 +56,26 @@ impl<'cfg> RegistryIndex<'cfg> {
///
/// Returns a list of pairs of (summary, yanked) for the package name
/// specified.
pub fn summaries(&mut self, name: &str) -> CargoResult<&Vec<(Summary, bool)>> {
pub fn summaries(&mut self,
name: &str,
load: &mut RegistryData)
-> CargoResult<&Vec<(Summary, bool)>> {
if self.cache.contains_key(name) {
return Ok(&self.cache[name]);
}
let summaries = self.load_summaries(name)?;
let summaries = self.load_summaries(name, load)?;
let summaries = summaries.into_iter().filter(|summary| {
summary.0.package_id().name() == name
}).collect();
self.cache.insert(name.to_string(), summaries);
Ok(&self.cache[name])
}

fn load_summaries(&mut self, name: &str) -> CargoResult<Vec<(Summary, bool)>> {
let (path, _lock) = if self.locked {
fn load_summaries(&mut self,
name: &str,
load: &mut RegistryData)
-> CargoResult<Vec<(Summary, bool)>> {
let (root, _lock) = if self.locked {
let lock = self.path.open_ro(Path::new(INDEX_LOCK),
self.config,
"the registry index");
Expand All @@ -84,25 +95,32 @@ impl<'cfg> RegistryIndex<'cfg> {

// see module comment for why this is structured the way it is
let path = match fs_name.len() {
1 => path.join("1").join(&fs_name),
2 => path.join("2").join(&fs_name),
3 => path.join("3").join(&fs_name[..1]).join(&fs_name),
_ => path.join(&fs_name[0..2])
.join(&fs_name[2..4])
.join(&fs_name),
1 => format!("1/{}", fs_name),
2 => format!("2/{}", fs_name),
3 => format!("3/{}/{}", &fs_name[..1], fs_name),
_ => format!("{}/{}/{}", &fs_name[0..2], &fs_name[2..4], fs_name),
// 1 => Path::new("1").join(fs_name),
// 2 => Path::new("2").join(fs_name),
// 3 => Path::new("3").join(&fs_name[..1]).join(fs_name),
// _ => Path::new(&fs_name[0..2]).join(&fs_name[2..4]).join(fs_name),
};
match File::open(&path) {
Ok(mut f) => {
let mut contents = String::new();
f.read_to_string(&mut contents)?;
let ret: CargoResult<Vec<(Summary, bool)>>;
ret = contents.lines().filter(|l| !l.trim().is_empty())
.map(|l| self.parse_registry_package(l))
.collect();
ret.chain_error(|| {
internal(format!("failed to parse registry's information \
for: {}", name))
})
match load.load(&root, Path::new(&path)) {
Ok(contents) => {
let contents = str::from_utf8(&contents).map_err(|_| {
human("registry index file was not valid utf-8")
})?;
let lines = contents.lines()
.map(|s| s.trim())
.filter(|l| !l.is_empty());

// Attempt forwards-compatibility on the index by ignoring
// every line that we ourselves don't understand. This should
// allow future cargo implementations to change the
// interpretation of each line here, while older cargo will simply
// ignore the new lines.
Ok(lines.filter_map(|line| {
self.parse_registry_package(line).ok()
}).collect())
}
Err(..) => Ok(Vec::new()),
}
Expand Down Expand Up @@ -161,12 +179,13 @@ impl<'cfg> RegistryIndex<'cfg> {
.set_kind(kind)
.into_dependency())
}
}

impl<'cfg> Registry for RegistryIndex<'cfg> {
fn query(&mut self, dep: &Dependency) -> CargoResult<Vec<Summary>> {
pub fn query(&mut self,
dep: &Dependency,
load: &mut RegistryData)
-> CargoResult<Vec<Summary>> {
let mut summaries = {
let summaries = self.summaries(dep.name())?;
let summaries = self.summaries(dep.name(), load)?;
summaries.iter().filter(|&&(_, yanked)| {
dep.source_id().precise().is_some() || !yanked
}).map(|s| s.0.clone()).collect::<Vec<_>>()
Expand All @@ -188,8 +207,4 @@ impl<'cfg> Registry for RegistryIndex<'cfg> {
});
summaries.query(dep)
}

fn supports_checksums(&self) -> bool {
true
}
}
9 changes: 7 additions & 2 deletions src/cargo/sources/registry/local.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@ use rustc_serialize::hex::ToHex;

use core::PackageId;
use sources::registry::{RegistryData, RegistryConfig};
use util::{Config, CargoResult, ChainError, human, Sha256, Filesystem};
use util::FileLock;
use util::paths;
use util::{Config, CargoResult, ChainError, human, Sha256, Filesystem};

pub struct LocalRegistry<'cfg> {
index_path: Filesystem,
Expand All @@ -34,7 +35,11 @@ impl<'cfg> RegistryData for LocalRegistry<'cfg> {
&self.index_path
}

fn config(&self) -> CargoResult<Option<RegistryConfig>> {
/// Loads the contents of the file at `path`, interpreted relative to `root`.
///
/// The two paths are joined and the result is handed to `paths::read_bytes`;
/// whatever that helper returns (the bytes or an error) is passed back to the
/// caller unchanged.
fn load(&self, root: &Path, path: &Path) -> CargoResult<Vec<u8>> {
paths::read_bytes(&root.join(path))
}

fn config(&mut self) -> CargoResult<Option<RegistryConfig>> {
// Local registries don't have configuration for remote APIs or anything
// like that
Ok(None)
Expand Down
13 changes: 7 additions & 6 deletions src/cargo/sources/registry/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,8 @@ struct RegistryDependency {

pub trait RegistryData {
fn index_path(&self) -> &Filesystem;
fn config(&self) -> CargoResult<Option<RegistryConfig>>;
fn load(&self, root: &Path, path: &Path) -> CargoResult<Vec<u8>>;
fn config(&mut self) -> CargoResult<Option<RegistryConfig>>;
fn update_index(&mut self) -> CargoResult<()>;
fn download(&mut self,
pkg: &PackageId,
Expand Down Expand Up @@ -274,7 +275,7 @@ impl<'cfg> RegistrySource<'cfg> {
/// Decode the configuration stored within the registry.
///
/// This requires that the index has been at least checked out.
pub fn config(&self) -> CargoResult<Option<RegistryConfig>> {
pub fn config(&mut self) -> CargoResult<Option<RegistryConfig>> {
self.ops.config()
}

Expand Down Expand Up @@ -323,12 +324,12 @@ impl<'cfg> Registry for RegistrySource<'cfg> {
// come back with no summaries, then our registry may need to be
// updated, so we fall back to performing a lazy update.
if dep.source_id().precise().is_some() && !self.updated {
if self.index.query(dep)?.is_empty() {
if self.index.query(dep, &mut *self.ops)?.is_empty() {
self.do_update()?;
}
}

self.index.query(dep)
self.index.query(dep, &mut *self.ops)
}

fn supports_checksums(&self) -> bool {
Expand Down Expand Up @@ -356,7 +357,7 @@ impl<'cfg> Source for RegistrySource<'cfg> {
}

fn download(&mut self, package: &PackageId) -> CargoResult<Package> {
let hash = self.index.hash(package)?;
let hash = self.index.hash(package, &mut *self.ops)?;
let path = self.ops.download(package, &hash)?;
let path = self.unpack_package(package, &path).chain_error(|| {
internal(format!("failed to unpack package `{}`", package))
Expand All @@ -369,7 +370,7 @@ impl<'cfg> Source for RegistrySource<'cfg> {
// differ due to historical Cargo bugs. To paper over these we trash the
// *summary* loaded from the Cargo.toml we just downloaded with the one
// we loaded from the index.
let summaries = self.index.summaries(package.name())?;
let summaries = self.index.summaries(package.name(), &mut *self.ops)?;
let summary = summaries.iter().map(|s| &s.0).find(|s| {
s.package_id() == package
}).expect("summary not found");
Expand Down
Loading

0 comments on commit 15cc376

Please sign in to comment.