Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 28 additions & 22 deletions crates/uv-cache-info/src/cache_info.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use serde::Deserialize;
use tracing::{debug, warn};

use crate::git_info::{Commit, Tags};
use crate::glob::cluster_globs;
use crate::timestamp::Timestamp;

#[derive(Debug, thiserror::Error)]
Expand Down Expand Up @@ -212,34 +213,39 @@ impl CacheInfo {
}
}

// If we have any globs, process them in a single pass.
// If we have any globs, first cluster them using LCP and then do a single pass on each group.
if !globs.is_empty() {
let walker = globwalk::GlobWalkerBuilder::from_patterns(directory, &globs)
for (glob_base, glob_patterns) in cluster_globs(&globs) {
let walker = globwalk::GlobWalkerBuilder::from_patterns(
directory.join(glob_base),
&glob_patterns,
)
.file_type(globwalk::FileType::FILE | globwalk::FileType::SYMLINK)
.build()?;
for entry in walker {
let entry = match entry {
Ok(entry) => entry,
Err(err) => {
warn!("Failed to read glob entry: {err}");
continue;
}
};
let metadata = match entry.metadata() {
Ok(metadata) => metadata,
Err(err) => {
warn!("Failed to read metadata for glob entry: {err}");
for entry in walker {
let entry = match entry {
Ok(entry) => entry,
Err(err) => {
warn!("Failed to read glob entry: {err}");
continue;
}
};
let metadata = match entry.metadata() {
Ok(metadata) => metadata,
Err(err) => {
warn!("Failed to read metadata for glob entry: {err}");
continue;
}
};
if !metadata.is_file() {
warn!(
"Expected file for cache key, but found directory: `{}`",
entry.path().display()
);
continue;
}
};
if !metadata.is_file() {
warn!(
"Expected file for cache key, but found directory: `{}`",
entry.path().display()
);
continue;
timestamp = max(timestamp, Some(Timestamp::from_metadata(&metadata)));
}
timestamp = max(timestamp, Some(Timestamp::from_metadata(&metadata)));
}
}

Expand Down
318 changes: 318 additions & 0 deletions crates/uv-cache-info/src/glob.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,318 @@
use std::{
collections::BTreeMap,
path::{Component, Components, Path, PathBuf},
};

/// Check if a component of the path looks like it may be a glob pattern.
///
/// Note: this function is being used when splitting a glob pattern into a long possible
/// base and the glob remainder (scanning through components until we hit the first component
/// for which this function returns true). It is acceptable for this function to return
/// false positives (e.g. patterns like 'foo[bar' or 'foo{bar') in which case correctness
/// will not be affected but efficiency might be (because we'll traverse more than we should),
/// however it should not return false negatives.
fn is_glob_like(part: Component) -> bool {
matches!(part, Component::Normal(_))
&& part.as_os_str().to_str().is_some_and(|part| {
["*", "{", "}", "?", "[", "]"]
.into_iter()
.any(|c| part.contains(c))
})
}

#[derive(Debug, Default, Clone, PartialEq, Eq)]
struct GlobParts {
base: PathBuf,
pattern: PathBuf,
}

/// Split a glob into longest possible base + shortest possible glob pattern.
fn split_glob(pattern: impl AsRef<str>) -> GlobParts {
let pattern: &Path = pattern.as_ref().as_ref();

let mut glob = GlobParts::default();
let mut globbing = false;
let mut last = None;

for part in pattern.components() {
if let Some(last) = last {
if last != Component::CurDir {
if globbing {
glob.pattern.push(last);
} else {
glob.base.push(last);
}
}
}
if !globbing {
globbing = is_glob_like(part);
}
// we don't know if this part is the last one, defer handling it by one iteration
last = Some(part);
}

if let Some(last) = last {
// defer handling the last component to prevent draining entire pattern into base
if globbing || matches!(last, Component::Normal(_)) {
glob.pattern.push(last);
} else {
glob.base.push(last);
}
}
glob
}

/// Classic trie with edges being path components and values being glob patterns.
#[derive(Default)]
struct Trie<'a> {
children: BTreeMap<Component<'a>, Trie<'a>>,
patterns: Vec<&'a Path>,
}

impl<'a> Trie<'a> {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like this Trie implementation makes pretty heavy use of recursion. And the call depth is in turn proportional to the length of data controlled by end users. That's generally something I think we should try to avoid. Although, in this case, paths are likely limited in length in most cases to a point where the call depth isn't going to get deep enough to overflow the stack. But I'm a little uneasy about relying on such things since that's more of a platform restriction as opposed to something we check and enforce ourselves.

Copy link
Copy Markdown
Contributor Author

@aldanor aldanor Jun 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

aldanor/multiglob@bbd43be

(fixed in that PR because there's some edge cases fixed there and lots of tests, including multi-platform ones)

// Apologies for all the confusion, I decided to pull it out into a standalone crate after @konstin asked if we can have less complexity in uv itself; but this also allowed to thoroughly go through all the multi-platform edge cases and write tons of tests. If uv requires this crate under its control I'm ok with passing ownership or whatever else you guys want (although it may be pretty useful for general public).

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Personally, I'd rather the small targeted change you have in this PR. Maintaining a whole other crate for this is a lot to ask.

I'm not keen on depending on a crate made just so that the code doesn't live in uv proper. Its long term maintenance story isn't clear to me. I can't speak for @konstin, but "off the shelf components" to me means existing components. Sorry about the confusion here.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm different to @BurntSushi here for maintainability, he has more experience with glob implementations than me

fn insert(&mut self, mut components: Components<'a>, pattern: &'a Path) {
if let Some(part) = components.next() {
self.children
.entry(part)
.or_default()
.insert(components, pattern);
} else {
self.patterns.push(pattern);
}
}

#[allow(clippy::needless_pass_by_value)]
fn collect_patterns(
&self,
pattern_prefix: PathBuf,
group_prefix: PathBuf,
patterns: &mut Vec<PathBuf>,
groups: &mut Vec<(PathBuf, Vec<PathBuf>)>,
) {
// collect all patterns beneath and including this node
for pattern in &self.patterns {
patterns.push(pattern_prefix.join(pattern));
}
for (part, child) in &self.children {
if let Component::Normal(_) = part {
// for normal components, collect all descendant patterns ('normal' edges only)
child.collect_patterns(
pattern_prefix.join(part),
group_prefix.join(part),
patterns,
groups,
);
} else {
// for non-normal component edges, kick off separate group collection at this node
child.collect_groups(group_prefix.join(part), groups);
}
}
}

#[allow(clippy::needless_pass_by_value)]
fn collect_groups(&self, prefix: PathBuf, groups: &mut Vec<(PathBuf, Vec<PathBuf>)>) {
// LCP-style grouping of patterns
if self.patterns.is_empty() {
// no patterns in this node; child nodes can form independent groups
for (part, child) in &self.children {
child.collect_groups(prefix.join(part), groups);
}
} else {
// pivot point, we've hit a pattern node; we have to stop here and form a group
let mut group = Vec::new();
self.collect_patterns(PathBuf::new(), prefix.clone(), &mut group, groups);
groups.push((prefix, group));
}
}
}

/// Given a collection of globs, cluster them into (base, globs) groups so that:
/// - base doesn't contain any glob symbols
/// - each directory would only be walked at most once
/// - base of each group is the longest common prefix of globs in the group
pub(crate) fn cluster_globs(patterns: &[impl AsRef<str>]) -> Vec<(PathBuf, Vec<String>)> {
// split all globs into base/pattern
let globs: Vec<_> = patterns.iter().map(split_glob).collect();

// construct a path trie out of all split globs
let mut trie = Trie::default();
for glob in &globs {
trie.insert(glob.base.components(), &glob.pattern);
}

// run LCP-style aggregation of patterns in the trie into groups
let mut groups = Vec::new();
trie.collect_groups(PathBuf::new(), &mut groups);

// finally, convert resulting patterns to strings
groups
.into_iter()
.map(|(base, patterns)| {
(
base,
patterns
.iter()
// NOTE: this unwrap is ok because input patterns are valid utf-8
.map(|p| p.to_str().unwrap().to_owned())
.collect(),
)
})
.collect()
}

#[cfg(test)]
mod tests {
use super::{GlobParts, cluster_globs, split_glob};

fn windowsify(path: &str) -> String {
if cfg!(windows) {
path.replace('/', "\\")
} else {
path.to_owned()
}
}

#[test]
fn test_split_glob() {
#[track_caller]
fn check(input: &str, base: &str, pattern: &str) {
let result = split_glob(input);
let expected = GlobParts {
base: base.into(),
pattern: pattern.into(),
};
assert_eq!(result, expected, "{input:?} != {base:?} + {pattern:?}");
}

check("", "", "");
check("a", "", "a");
check("a/b", "a", "b");
check("a/b/", "a", "b");
check("a/.//b/", "a", "b");
check("./a/b/c", "a/b", "c");
check("c/d/*", "c/d", "*");
check("c/d/*/../*", "c/d", "*/../*");
check("a/?b/c", "a", "?b/c");
check("/a/b/*", "/a/b", "*");
check("../x/*", "../x", "*");
check("a/{b,c}/d", "a", "{b,c}/d");
check("a/[bc]/d", "a", "[bc]/d");
check("*", "", "*");
check("*/*", "", "*/*");
check("..", "..", "");
check("/", "/", "");
}

#[test]
fn test_cluster_globs() {
#[track_caller]
fn check(input: &[&str], expected: &[(&str, &[&str])]) {
let input = input.iter().map(|s| windowsify(s)).collect::<Vec<_>>();

let mut result_sorted = cluster_globs(&input);
for (_, patterns) in &mut result_sorted {
patterns.sort_unstable();
}
result_sorted.sort_unstable();

let mut expected_sorted = Vec::new();
for (base, patterns) in expected {
let mut patterns_sorted = Vec::new();
for pattern in *patterns {
patterns_sorted.push(windowsify(pattern));
}
patterns_sorted.sort_unstable();
expected_sorted.push((windowsify(base).into(), patterns_sorted));
}
expected_sorted.sort_unstable();

assert_eq!(
result_sorted, expected_sorted,
"{input:?} != {expected_sorted:?} (got: {result_sorted:?})"
);
}

check(&["a/b/*", "a/c/*"], &[("a/b", &["*"]), ("a/c", &["*"])]);
check(&["./a/b/*", "a/c/*"], &[("a/b", &["*"]), ("a/c", &["*"])]);
check(&["/a/b/*", "/a/c/*"], &[("/a/b", &["*"]), ("/a/c", &["*"])]);
check(
&["../a/b/*", "../a/c/*"],
&[("../a/b", &["*"]), ("../a/c", &["*"])],
);
check(&["x/*", "y/*"], &[("x", &["*"]), ("y", &["*"])]);
check(&[], &[]);
check(
&["./*", "a/*", "../foo/*.png"],
&[("", &["*", "a/*"]), ("../foo", &["*.png"])],
);
check(
&[
"?",
"/foo/?",
"/foo/bar/*",
"../bar/*.png",
"../bar/../baz/*.jpg",
],
&[
("", &["?"]),
("/foo", &["?", "bar/*"]),
("../bar", &["*.png"]),
("../bar/../baz", &["*.jpg"]),
],
);
check(&["/abs/path/*"], &[("/abs/path", &["*"])]);
check(&["/abs/*", "rel/*"], &[("/abs", &["*"]), ("rel", &["*"])]);
check(&["a/{b,c}/*", "a/d?/*"], &[("a", &["{b,c}/*", "d?/*"])]);
check(
&[
"../shared/a/[abc].png",
"../shared/a/b/*",
"../shared/b/c/?x/d",
"docs/important/*.{doc,xls}",
"docs/important/very/*",
],
&[
("../shared/a", &["[abc].png", "b/*"]),
("../shared/b/c", &["?x/d"]),
("docs/important", &["*.{doc,xls}", "very/*"]),
],
);
check(&["file.txt"], &[("", &["file.txt"])]);
check(&["/"], &[("/", &[""])]);
check(&[".."], &[("..", &[""])]);
check(
&["file1.txt", "file2.txt"],
&[("", &["file1.txt", "file2.txt"])],
);
check(
&["a/file1.txt", "a/file2.txt"],
&[("a", &["file1.txt", "file2.txt"])],
);
check(
&["*", "a/b/*", "a/../c/*.jpg", "a/../c/*.png", "/a/*", "/b/*"],
&[
("", &["*", "a/b/*"]),
("a/../c", &["*.jpg", "*.png"]),
("/a", &["*"]),
("/b", &["*"]),
],
);

if cfg!(windows) {
check(
&[
r"\\foo\bar\shared/a/[abc].png",
r"\\foo\bar\shared/a/b/*",
r"\\foo\bar/shared/b/c/?x/d",
r"D:\docs\important/*.{doc,xls}",
r"D:\docs/important/very/*",
],
&[
(r"\\foo\bar\shared\a", &["[abc].png", r"b\*"]),
(r"\\foo\bar\shared\b\c", &[r"?x\d"]),
(r"D:\docs\important", &["*.{doc,xls}", r"very\*"]),
],
);
}
}
}
1 change: 1 addition & 0 deletions crates/uv-cache-info/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ pub use crate::timestamp::*;

mod cache_info;
mod git_info;
mod glob;
mod timestamp;
Loading