diff --git a/Cargo.lock b/Cargo.lock index 4aec0304c4..ef4ec03655 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4314,7 +4314,6 @@ dependencies = [ "http 1.2.0", "human_bytes", "humantime", - "ignore", "indexmap 2.7.1", "indicatif", "insta", @@ -4357,6 +4356,7 @@ dependencies = [ "rattler_shell", "rattler_solve", "rattler_virtual_packages", + "rayon", "regex", "reqwest", "reqwest-middleware", diff --git a/Cargo.toml b/Cargo.toml index 2b7d37ebc7..92b55f9f43 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -47,7 +47,6 @@ http = "1.2.0" http-cache-reqwest = "0.15.1" human_bytes = "0.4.3" humantime = "2.1.0" -ignore = "0.4.23" indexmap = "2.7.1" indicatif = "0.17.11" insta = "1.42.1" @@ -240,7 +239,6 @@ fs_extra = { workspace = true } futures = { workspace = true } human_bytes = { workspace = true } humantime = { workspace = true } -ignore = { workspace = true } indexmap = { workspace = true, features = ["serde"] } indicatif = { workspace = true } is_executable = { workspace = true } @@ -298,6 +296,7 @@ pixi_uv_conversions = { workspace = true } pypi_mapping = { workspace = true } pypi_modifiers = { workspace = true } rattler_virtual_packages = { workspace = true } +rayon = "1.10.0" regex = { workspace = true } reqwest = { workspace = true, features = [ "http2", diff --git a/crates/pixi_glob/src/glob_set.rs b/crates/pixi_glob/src/glob_set.rs index 2d70e97ae0..6f04894699 100644 --- a/crates/pixi_glob/src/glob_set.rs +++ b/crates/pixi_glob/src/glob_set.rs @@ -7,7 +7,8 @@ use itertools::{Either, Itertools}; use thiserror::Error; use wax::{Glob, WalkEntry}; -pub(crate) struct GlobSet<'t> { +/// A set of globs to include and exclude from a directory. +pub struct GlobSet<'t> { /// The globs to include in the filter. pub include: Vec>, /// The globs to exclude from the filter. @@ -15,6 +16,7 @@ pub(crate) struct GlobSet<'t> { } #[derive(Error, Debug)] +#[allow(missing_docs)] pub enum GlobSetError { #[error("failed to access {}", .0.display())] Io(PathBuf, #[source] io::Error), @@ -30,6 +32,10 @@ pub enum GlobSetError { } impl<'t> GlobSet<'t> { + /// Create a new `GlobSet` from a list of globs. + /// + /// The globs are split into inclusion and exclusion globs based on whether they + /// start with `!`. pub fn create(globs: impl IntoIterator) -> Result, GlobSetError> { // Split the globs into inclusion and exclusion globs based on whether they // start with `!`. diff --git a/crates/pixi_glob/src/lib.rs b/crates/pixi_glob/src/lib.rs index 16bf8df4ec..6885a5b55c 100644 --- a/crates/pixi_glob/src/lib.rs +++ b/crates/pixi_glob/src/lib.rs @@ -10,3 +10,4 @@ mod glob_set; pub use glob_hash::{GlobHash, GlobHashError}; pub use glob_hash_cache::{GlobHashCache, GlobHashKey}; pub use glob_mtime::{GlobModificationTime, GlobModificationTimeError}; +pub use glob_set::{GlobSet, GlobSetError}; diff --git a/pixi_docs/Cargo.lock b/pixi_docs/Cargo.lock index 8998cb5f5a..5def575884 100644 --- a/pixi_docs/Cargo.lock +++ b/pixi_docs/Cargo.lock @@ -4167,7 +4167,6 @@ dependencies = [ "futures", "human_bytes", "humantime", - "ignore", "indexmap 2.7.1", "indicatif", "is_executable", @@ -4208,6 +4207,7 @@ dependencies = [ "rattler_shell", "rattler_solve", "rattler_virtual_packages", + "rayon", "regex", "reqwest", "reqwest-middleware", diff --git a/src/task/file_hashes.rs b/src/task/file_hashes.rs index b31782943a..ca3778f396 100644 --- a/src/task/file_hashes.rs +++ b/src/task/file_hashes.rs @@ -7,24 +7,29 @@ //! The main entry-point to compute the hashes of all files in a directory is the //! [`FileHashes::from_files`] method. -use ignore::{overrides::OverrideBuilder, WalkBuilder}; use itertools::Itertools; -use std::hash::Hash; +use pixi_glob::{GlobSet, GlobSetError}; +use rayon::prelude::*; +use std::sync::LazyLock; use std::{ + clone::Clone, collections::HashMap, + fmt::Debug, fs::File, - hash::Hasher, + hash::{Hash, Hasher}, io::{BufRead, BufReader}, path::{Path, PathBuf}, + sync::Arc, }; use thiserror::Error; use tokio::task::JoinError; +use uv_configuration::RAYON_INITIALIZE; use xxhash_rust::xxh3::Xxh3; #[derive(Debug, Error)] pub enum FileHashesError { #[error(transparent)] - WalkError(#[from] ignore::Error), + WalkError(#[from] GlobSetError), #[error("I/O error while reading file {0}")] IoError(PathBuf, #[source] std::io::Error), @@ -72,10 +77,10 @@ impl FileHashes { } // Construct the custom filter - let mut ignore_builder = OverrideBuilder::new(root); + let mut ignore_builder = vec![]; for ignore_line in filters { let path = root.join(ignore_line.as_ref()); - let mut pat = if ignore_line.as_ref().ends_with('/') { + let pat = if ignore_line.as_ref().ends_with('/') { format!("{}**", ignore_line.as_ref()) } else if path.exists() && path.is_dir() { format!("{}/**", ignore_line.as_ref()) @@ -83,20 +88,10 @@ impl FileHashes { ignore_line.as_ref().to_owned() }; - if pat.starts_with('!') && !pat.starts_with("!/") { - // make sure there is a `/` at the 2nd place so that the pattern reads - // `!/**/lib.rs` instead of `!**/lib.rs` - pat.insert(1, '/'); - } else { - // Same for the others, make sure they start in the right folder - if !pat.starts_with('/') { - pat.insert(0, '/'); - } - } - ignore_builder.add(&pat)?; + ignore_builder.push(pat); } - let filter = ignore_builder.build()?; + let glob = GlobSet::create(ignore_builder.iter().map(|s| s.as_str()))?; // Spawn a thread that will collect the results from a channel. let (tx, rx) = crossbeam_channel::bounded(100); @@ -104,43 +99,37 @@ impl FileHashes { tokio::task::spawn_blocking(move || rx.iter().collect::, _>>()); // Iterate over all entries in parallel and send them over a channel to the collection thread. - let collect_root = root.to_owned(); - WalkBuilder::new(root) - .overrides(filter) - .hidden(false) - .git_ignore(false) - .git_global(false) - .git_exclude(false) - // Turn this back off as it can cause issues with symlinks: - // https://github.com/prefix-dev/pixi/issues/2196 - // TODO: The current idea is to completely reimplement this without the `ignore` crate. - // .follow_links(true) - .build_parallel() - .run(|| { - let tx = tx.clone(); - let collect_root = collect_root.clone(); - Box::new(move |entry| { - let result = match entry { - Ok(entry) if entry.file_type().map(|t| t.is_dir()).unwrap_or(false) => { - return ignore::WalkState::Continue; - } - Ok(entry) => compute_file_hash(entry.path()).map(|hash| { - let path = entry - .path() - .strip_prefix(&collect_root) - .expect("path is not prefixed by the root"); - tracing::info!("Added hash for file: {:?}", path); - (path.to_owned(), hash) - }), - Err(e) => Err(FileHashesError::from(e)), - }; - match (result.is_err(), tx.send(result)) { - (true, _) => ignore::WalkState::Quit, - (_, Err(_)) => ignore::WalkState::Quit, - _ => ignore::WalkState::Continue, - } + let collect_root = Arc::new(root.to_owned()); + + // Collect all entries first to avoid holding lock during iteration + let entries: Vec<_> = glob.filter_directory(root).collect::, _>>()?; + + // Force the initialization of the rayon thread pool to avoid implicit creation + // by the Installer. + LazyLock::force(&RAYON_INITIALIZE); + + // Process entries in parallel using rayon + entries.into_par_iter().for_each(|entry| { + let tx = tx.clone(); + let collect_root = Arc::clone(&collect_root); + + let result: Result<(PathBuf, String), FileHashesError> = if entry.file_type().is_dir() { + // Skip directories + return; + } else { + compute_file_hash(entry.path()).map(|hash| { + let path = entry + .path() + .strip_prefix(&*collect_root) + .expect("path is not prefixed by the root"); + tracing::info!("Added hash for file: {:?}", path); + (path.to_owned(), hash) }) - }); + }; + + // Send result to channel - if it fails, we just continue with the next item + let _ = tx.send(result); + }); // Drop the local handle to the channel. This will close the channel which in turn will // cause the collection thread to finish which allows us to join without deadlocking.