diff --git a/src/task/task_source_checker.rs b/src/task/task_source_checker.rs index 9bbf8d63ec..dff95611ed 100644 --- a/src/task/task_source_checker.rs +++ b/src/task/task_source_checker.rs @@ -2,14 +2,20 @@ use crate::config::{Config, Settings}; use crate::dirs; use crate::file::{self, display_path}; use crate::hash; +use crate::rand::random_string; use crate::task::Task; use eyre::{Result, eyre}; +use flate2::Compression; +use flate2::read::ZlibDecoder; +use flate2::write::ZlibEncoder; use glob::glob; use ignore::overrides::{Override, OverrideBuilder}; use itertools::Itertools; +use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; -use std::fs; +use std::fs::{self, File}; use std::hash::{DefaultHasher, Hash, Hasher}; +use std::io::{Read, Write}; use std::path::{Path, PathBuf}; use std::sync::Arc; use std::time::{SystemTime, UNIX_EPOCH}; @@ -193,8 +199,6 @@ pub async fn sources_are_fresh(task: &Task, config: &Arc) -> Result Result { let root = task_cwd(task, config).await?; let matcher = build_source_matcher(&root, &task.sources); @@ -238,7 +242,13 @@ pub async fn sources_are_fresh(task: &Task, config: &Arc) -> Result) -> Result<()> { Ok(()) } -/// Get the path to store source hashes for a task -fn sources_hash_path(task: &Task, root: &Path, content_hash: bool) -> PathBuf { +/// Identity hash for a task in a given working directory. Used as the +/// filename stem for any per-task state we write under `STATE/task-sources/`, +/// so that changes to the task definition (sources, cmd, etc.), the config +/// file it came from, or the working directory all invalidate state in +/// lock-step. +fn task_state_key(task: &Task, root: &Path) -> String { let mut hasher = DefaultHasher::new(); task.hash(&mut hasher); task.config_source.hash(&mut hasher); root.hash(&mut hasher); - let hash = format!("{:x}", hasher.finish()); + format!("{:x}", hasher.finish()) +} + +/// Get the path to store source hashes for a task +fn sources_hash_path(task: &Task, root: &Path, content_hash: bool) -> PathBuf { let suffix = if content_hash { "-content" } else { "" }; dirs::STATE .join("task-sources") - .join(format!("{hash}{suffix}")) + .join(format!("{}{suffix}", task_state_key(task, root))) } /// Get the existing source hash for a task, if it exists @@ -392,14 +410,103 @@ fn file_metadatas_to_hash(metadatas: &[(PathBuf, fs::Metadata)]) -> String { hash::hash_to_str(&path_and_sizes) } -/// Convert file contents to a hash string for comparison using blake3 -/// More accurate than metadata hashing but slower since it reads all file contents -fn file_contents_to_hash(metadatas: &[(PathBuf, fs::Metadata)]) -> Result { +/// Per-file content hash cache entry. The `(size, mtime_secs, mtime_nanos)` +/// tuple is the cache key (in the git-style "stat-info" sense): when those +/// three match, we reuse `hash` without re-reading the file. +#[derive(Debug, Serialize, Deserialize)] +struct CachedFileHash { + mtime_secs: i64, + mtime_nanos: u32, + size: u64, + hash: String, +} + +type ContentHashCache = BTreeMap; + +/// Path to the per-task content-hash cache file. Shares `task_state_key` +/// with `sources_hash_path` so changes to the task definition invalidate +/// both in lock-step. +fn content_hash_cache_path(task: &Task, root: &Path) -> PathBuf { + dirs::STATE + .join("task-sources") + .join(format!("{}-content-cache", task_state_key(task, root))) +} + +fn load_content_hash_cache(path: &Path) -> ContentHashCache { + (|| -> Result { + let mut zlib = ZlibDecoder::new(File::open(path)?); + let mut bytes = Vec::new(); + zlib.read_to_end(&mut bytes)?; + Ok(rmp_serde::from_slice(&bytes)?) + })() + .unwrap_or_default() +} + +fn save_content_hash_cache(path: &Path, cache: &ContentHashCache) -> Result<()> { + if let Some(parent) = path.parent() { + file::create_dir_all(parent)?; + } + let partial = path.with_extension(format!("part-{}", random_string(8))); + { + let mut zlib = ZlibEncoder::new(File::create(&partial)?, Compression::fast()); + zlib.write_all(&rmp_serde::to_vec_named(cache)?)?; + // Propagate finalization errors explicitly — ZlibEncoder's Drop impl + // would silently discard them, leaving a truncated partial file that + // we'd then rename into place as a poisoned cache. + zlib.finish()?; + } + file::rename(&partial, path)?; + Ok(()) +} + +fn cached_entry_matches(entry: &CachedFileHash, metadata: &fs::Metadata) -> bool { + let Ok(mtime) = metadata.modified() else { + return false; + }; + let Ok(dur) = mtime.duration_since(UNIX_EPOCH) else { + return false; + }; + entry.size == metadata.len() + && entry.mtime_secs == dur.as_secs() as i64 + && entry.mtime_nanos == dur.subsec_nanos() +} + +fn make_cache_entry(metadata: &fs::Metadata, hash: String) -> CachedFileHash { + let dur = metadata + .modified() + .ok() + .and_then(|m| m.duration_since(UNIX_EPOCH).ok()); + CachedFileHash { + mtime_secs: dur.map(|d| d.as_secs() as i64).unwrap_or(0), + mtime_nanos: dur.map(|d| d.subsec_nanos()).unwrap_or(0), + size: metadata.len(), + hash, + } +} + +/// Convert file contents to a hash string for comparison using blake3. +/// +/// More accurate than metadata hashing but slower since it reads all file +/// contents. `cache` is consulted first: if a file's `(size, mtime_secs, +/// mtime_nanos)` match the cached entry, the stored hash is reused and the +/// file is not re-read. On return, `cache` is rebuilt from scratch with one +/// entry per current source file — entries for files no longer in `sources` +/// are pruned so the cache file size stays bounded. +fn file_contents_to_hash( + metadatas: &[(PathBuf, fs::Metadata)], + cache: &mut ContentHashCache, +) -> Result { let mut content_hashes: Vec<(&PathBuf, String)> = Vec::new(); - for (path, _) in metadatas { - let file_hash = hash::file_hash_blake3(path, None)?; - content_hashes.push((path, file_hash)); + let mut next: ContentHashCache = BTreeMap::new(); + for (path, metadata) in metadatas { + let hash = match cache.get(path) { + Some(entry) if cached_entry_matches(entry, metadata) => entry.hash.clone(), + _ => hash::file_hash_blake3(path, None)?, + }; + next.insert(path.clone(), make_cache_entry(metadata, hash.clone())); + content_hashes.push((path, hash)); } + *cache = next; Ok(hash::hash_to_str(&content_hashes)) } @@ -525,4 +632,83 @@ mod tests { let matcher = build_source_matcher(root, &sources); assert!(is_source(&matcher, Path::new("/elsewhere/Cargo.toml"))); } + + #[test] + fn content_hash_cache_reuses_unchanged_files() { + let tmp = tempfile::tempdir().unwrap(); + let a = tmp.path().join("a.txt"); + let b = tmp.path().join("b.txt"); + std::fs::write(&a, "hello").unwrap(); + std::fs::write(&b, "world").unwrap(); + let metadatas = vec![ + (a.clone(), a.metadata().unwrap()), + (b.clone(), b.metadata().unwrap()), + ]; + + let mut cache = ContentHashCache::new(); + let first = file_contents_to_hash(&metadatas, &mut cache).unwrap(); + assert_eq!(cache.len(), 2); + let a_hash_v1 = cache.get(&a).unwrap().hash.clone(); + + // Re-run with same files: hashes should be reused, aggregate unchanged. + let second = file_contents_to_hash(&metadatas, &mut cache).unwrap(); + assert_eq!(first, second); + assert_eq!(cache.get(&a).unwrap().hash, a_hash_v1); + + // Mutate `a` so size differs; aggregate hash must change. + std::fs::write(&a, "hello world").unwrap(); + let metadatas = vec![ + (a.clone(), a.metadata().unwrap()), + (b.clone(), b.metadata().unwrap()), + ]; + let third = file_contents_to_hash(&metadatas, &mut cache).unwrap(); + assert_ne!(second, third); + assert_ne!(cache.get(&a).unwrap().hash, a_hash_v1); + } + + #[test] + fn content_hash_cache_prunes_dropped_files() { + let tmp = tempfile::tempdir().unwrap(); + let a = tmp.path().join("a.txt"); + let b = tmp.path().join("b.txt"); + std::fs::write(&a, "hello").unwrap(); + std::fs::write(&b, "world").unwrap(); + + let mut cache = ContentHashCache::new(); + let metadatas = vec![ + (a.clone(), a.metadata().unwrap()), + (b.clone(), b.metadata().unwrap()), + ]; + file_contents_to_hash(&metadatas, &mut cache).unwrap(); + assert_eq!(cache.len(), 2); + + // Only `a` is a source this run — `b` should drop out of the cache. + let metadatas = vec![(a.clone(), a.metadata().unwrap())]; + file_contents_to_hash(&metadatas, &mut cache).unwrap(); + assert_eq!(cache.len(), 1); + assert!(cache.contains_key(&a)); + assert!(!cache.contains_key(&b)); + } + + #[test] + fn content_hash_cache_round_trips_through_disk() { + let tmp = tempfile::tempdir().unwrap(); + let a = tmp.path().join("a.txt"); + std::fs::write(&a, "hello").unwrap(); + + let mut cache = ContentHashCache::new(); + let metadatas = vec![(a.clone(), a.metadata().unwrap())]; + file_contents_to_hash(&metadatas, &mut cache).unwrap(); + + let cache_path = tmp.path().join("cache.bin"); + save_content_hash_cache(&cache_path, &cache).unwrap(); + let loaded = load_content_hash_cache(&cache_path); + assert_eq!(loaded.len(), 1); + assert_eq!(loaded.get(&a).unwrap().hash, cache.get(&a).unwrap().hash,); + + // Corrupt the file: loader must silently fall back to empty. + std::fs::write(&cache_path, b"not a valid msgpack stream").unwrap(); + let loaded = load_content_hash_cache(&cache_path); + assert!(loaded.is_empty()); + } }