diff --git a/lychee-bin/src/commands/dump.rs b/lychee-bin/src/commands/dump.rs index 9890fe7324..26df6f4018 100644 --- a/lychee-bin/src/commands/dump.rs +++ b/lychee-bin/src/commands/dump.rs @@ -1,10 +1,8 @@ use log::error; use lychee_lib::Request; use lychee_lib::Result; -use lychee_lib::filter::PathExcludes; use std::fs; use std::io::{self, Write}; -use std::path::PathBuf; use tokio_stream::StreamExt; use crate::ExitCode; @@ -12,24 +10,6 @@ use crate::verbosity::Verbosity; use super::CommandParams; -// Helper function to create an output writer. -// -// If the output file is not specified, it will use `stdout`. -// -// # Errors -// -// If the output file cannot be opened, an error is returned. -fn create_writer(output: Option) -> Result> { - let out = if let Some(output) = output { - let out = fs::OpenOptions::new().append(true).open(output)?; - Box::new(out) as Box - } else { - let out = io::stdout(); - Box::new(out.lock()) as Box - }; - Ok(out) -} - /// Dump all detected links to stdout without checking them pub(crate) async fn dump(params: CommandParams) -> Result where @@ -42,7 +22,7 @@ where fs::File::create(out_file)?; } - let mut writer = create_writer(params.cfg.output)?; + let mut writer = super::create_writer(params.cfg.output)?; while let Some(request) = requests.next().await { let mut request = request?; @@ -71,36 +51,6 @@ where Ok(ExitCode::Success) } -/// Dump all input sources to stdout without extracting any links and checking -/// them. 
-pub(crate) async fn dump_inputs( - sources: S, - output: Option<&PathBuf>, - excluded_paths: &PathExcludes, -) -> Result -where - S: futures::Stream>, -{ - if let Some(out_file) = output { - fs::File::create(out_file)?; - } - - let mut writer = create_writer(output.cloned())?; - - tokio::pin!(sources); - while let Some(source) = sources.next().await { - let source = source?; - - if excluded_paths.is_match(&source) { - continue; - } - - writeln!(writer, "{source}")?; - } - - Ok(ExitCode::Success) -} - /// Dump request to stdout fn write( writer: &mut Box, @@ -134,97 +84,3 @@ fn write( fn write_out(writer: &mut Box, out_str: &str) -> io::Result<()> { writeln!(writer, "{out_str}") } - -#[cfg(test)] -mod tests { - use super::*; - use futures::stream; - use tempfile::NamedTempFile; - - #[tokio::test] - async fn test_dump_inputs_basic() -> Result<()> { - // Create temp file for output - let temp_file = NamedTempFile::new()?; - let output_path = temp_file.path().to_path_buf(); - - // Create test input stream - let inputs = vec![ - Ok(String::from("test/path1")), - Ok(String::from("test/path2")), - Ok(String::from("test/path3")), - ]; - let stream = stream::iter(inputs); - - // Run dump_inputs - let result = dump_inputs(stream, Some(&output_path), &PathExcludes::empty()).await?; - assert_eq!(result, ExitCode::Success); - - // Verify output - let contents = fs::read_to_string(&output_path)?; - assert_eq!(contents, "test/path1\ntest/path2\ntest/path3\n"); - Ok(()) - } - - #[tokio::test] - async fn test_dump_inputs_with_excluded_paths() -> Result<()> { - let temp_file = NamedTempFile::new()?; - let output_path = temp_file.path().to_path_buf(); - - let inputs = vec![ - Ok(String::from("test/path1")), - Ok(String::from("excluded/path")), - Ok(String::from("test/path2")), - ]; - let stream = stream::iter(inputs); - - let excluded = &PathExcludes::new(["excluded"]).unwrap(); - let result = dump_inputs(stream, Some(&output_path), excluded).await?; - assert_eq!(result, 
ExitCode::Success); - - let contents = fs::read_to_string(&output_path)?; - assert_eq!(contents, "test/path1\ntest/path2\n"); - Ok(()) - } - - #[tokio::test] - async fn test_dump_inputs_empty_stream() -> Result<()> { - let temp_file = NamedTempFile::new()?; - let output_path = temp_file.path().to_path_buf(); - - let stream = stream::iter::>>(vec![]); - let result = dump_inputs(stream, Some(&output_path), &PathExcludes::empty()).await?; - assert_eq!(result, ExitCode::Success); - - let contents = fs::read_to_string(&output_path)?; - assert_eq!(contents, ""); - Ok(()) - } - - #[tokio::test] - async fn test_dump_inputs_error_in_stream() -> Result<()> { - let temp_file = NamedTempFile::new()?; - let output_path = temp_file.path().to_path_buf(); - - let inputs: Vec> = vec![ - Ok(String::from("test/path1")), - Err(io::Error::other("test error").into()), - Ok(String::from("test/path2")), - ]; - let stream = stream::iter(inputs); - - let result = dump_inputs(stream, Some(&output_path), &PathExcludes::empty()).await; - assert!(result.is_err()); - Ok(()) - } - - #[tokio::test] - async fn test_dump_inputs_to_stdout() -> Result<()> { - // When output path is None, should write to stdout - let inputs = vec![Ok(String::from("test/path1"))]; - let stream = stream::iter(inputs); - - let result = dump_inputs(stream, None, &PathExcludes::empty()).await?; - assert_eq!(result, ExitCode::Success); - Ok(()) - } -} diff --git a/lychee-bin/src/commands/dump_inputs.rs b/lychee-bin/src/commands/dump_inputs.rs new file mode 100644 index 0000000000..05f8164241 --- /dev/null +++ b/lychee-bin/src/commands/dump_inputs.rs @@ -0,0 +1,58 @@ +use lychee_lib::{FileExtensions, Input, Result}; +use std::collections::HashSet; +use std::fs; +use std::io::{self, Write}; +use std::path::PathBuf; +use tokio_stream::StreamExt; + +use crate::ExitCode; + +/// Print all input sources to stdout, without extracting or checking links. 
+/// +/// This command outputs the resolved input sources that would be processed +/// by lychee, including file paths, URLs, and special sources like stdin. +/// It respects file extension filtering and path exclusions. +pub(crate) async fn dump_inputs( + inputs: HashSet, + output: Option<&PathBuf>, + excluded_paths: &[String], + valid_extensions: &FileExtensions, + skip_hidden: bool, + skip_gitignored: bool, +) -> Result { + if let Some(out_file) = output { + fs::File::create(out_file)?; + } + + let mut writer = super::create_writer(output.cloned())?; + + // Create the path filter once outside the loop for better performance + let excluded_path_filter = lychee_lib::filter::PathExcludes::new(excluded_paths)?; + + // Collect all sources with deduplication + let mut seen_sources = HashSet::new(); + + for input in inputs { + let sources_stream = input.get_sources( + valid_extensions.clone(), + skip_hidden, + skip_gitignored, + &excluded_path_filter, + ); + tokio::pin!(sources_stream); + + while let Some(source_result) = sources_stream.next().await { + let source = source_result?; + // Only print if we haven't seen this source before + if seen_sources.insert(source.clone()) { + write_out(&mut writer, &source)?; + } + } + } + + Ok(ExitCode::Success) +} + +fn write_out(writer: &mut Box, out_str: &str) -> io::Result<()> { + writeln!(writer, "{out_str}") +} diff --git a/lychee-bin/src/commands/mod.rs b/lychee-bin/src/commands/mod.rs index 8643ad53af..1f00503f02 100644 --- a/lychee-bin/src/commands/mod.rs +++ b/lychee-bin/src/commands/mod.rs @@ -1,10 +1,14 @@ pub(crate) mod check; pub(crate) mod dump; +pub(crate) mod dump_inputs; pub(crate) use check::check; pub(crate) use dump::dump; -pub(crate) use dump::dump_inputs; +pub(crate) use dump_inputs::dump_inputs; +use std::fs; +use std::io::{self, Write}; +use std::path::PathBuf; use std::sync::Arc; use crate::cache::Cache; @@ -19,3 +23,15 @@ pub(crate) struct CommandParams>> { pub(crate) requests: S, pub(crate) cfg: Config, 
} + +/// Creates a writer that outputs to a file or stdout. +/// +/// # Errors +/// +/// Returns an error if the output file cannot be opened. +fn create_writer(output: Option) -> Result> { + Ok(match output { + Some(path) => Box::new(fs::OpenOptions::new().append(true).open(path)?), + None => Box::new(io::stdout().lock()), + }) +} diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index 18dc6455fa..d3fff2dc10 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -316,28 +316,29 @@ async fn run(opts: &LycheeOptions) -> Result { } }; - let mut collector = Collector::new(opts.config.root_dir.clone(), base)? - .skip_missing_inputs(opts.config.skip_missing) - .skip_hidden(!opts.config.hidden) - .skip_ignored(!opts.config.no_ignore) - .include_verbatim(opts.config.include_verbatim) - .headers(HeaderMap::from_header_pairs(&opts.config.header)?) - .excluded_paths(PathExcludes::new(opts.config.exclude_path.clone())?) - // File a bug if you rely on this envvar! It's going to go away eventually. - .use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").is_ok_and(|x| x == "1")); - if opts.config.dump_inputs { - let sources = collector.collect_sources(inputs); let exit_code = commands::dump_inputs( - sources, + inputs, opts.config.output.as_ref(), - &PathExcludes::new(&opts.config.exclude_path)?, + &opts.config.exclude_path, + &opts.config.extensions, + !opts.config.hidden, + !opts.config.no_ignore, ) .await?; return Ok(exit_code as i32); } + let mut collector = Collector::new(opts.config.root_dir.clone(), base)? + .skip_missing_inputs(opts.config.skip_missing) + .skip_hidden(!opts.config.hidden) + .skip_ignored(!opts.config.no_ignore) + .include_verbatim(opts.config.include_verbatim) + .headers(HeaderMap::from_header_pairs(&opts.config.header)?) + .excluded_paths(PathExcludes::new(opts.config.exclude_path.clone())?) + // File a bug if you rely on this envvar! It's going to go away eventually. 
+ .use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").is_ok_and(|x| x == "1")); collector = if let Some(ref basic_auth) = opts.config.basic_auth { collector.basic_auth_extractor(BasicAuthExtractor::new(basic_auth)?) } else { diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 371eb04929..afd8c0bbd6 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -4,7 +4,7 @@ mod cli { collections::{HashMap, HashSet}, error::Error, fs::{self, File}, - io::Write, + io::{BufRead, Write}, path::{Path, PathBuf}, time::Duration, }; @@ -79,6 +79,12 @@ mod cli { root_path().join("fixtures") } + /// Helper function to convert a relative path to an absolute path string + /// starting from a base directory. + fn path_str(base: &Path, relative_path: &str) -> String { + base.join(relative_path).to_string_lossy().to_string() + } + #[derive(Default, Serialize)] struct MockResponseStats { detailed_stats: bool, @@ -1844,7 +1850,6 @@ mod cli { .stdout(contains("fixtures/dump_inputs/subfolder/file2.md")) .stdout(contains("fixtures/dump_inputs/subfolder")) .stdout(contains("fixtures/dump_inputs/markdown.md")) - .stdout(contains("fixtures/dump_inputs/subfolder/example.bin")) .stdout(contains("fixtures/dump_inputs/some_file.txt")); Ok(()) @@ -1871,11 +1876,23 @@ mod cli { #[test] fn test_dump_inputs_url() -> Result<()> { let mut cmd = main_command(); - cmd.arg("--dump-inputs") + let output = cmd + .arg("--dump-inputs") .arg("https://example.com") .assert() .success() - .stdout(contains("https://example.com")); + .get_output() + .stdout + .clone(); + + let actual_lines: Vec = output + .lines() + .map(|line| line.unwrap().to_string()) + .collect(); + + let expected_lines = vec!["https://example.com/".to_string()]; + + assert_eq!(actual_lines, expected_lines); Ok(()) } @@ -1883,11 +1900,114 @@ mod cli { #[test] fn test_dump_inputs_path() -> Result<()> { let mut cmd = main_command(); + let output = cmd + .arg("--dump-inputs") + 
.arg(fixtures_path().join("dump_inputs")) + .assert() + .success() + .get_output() + .stdout + .clone(); + + let mut actual_lines: Vec = output + .lines() + .map(|line| line.unwrap().to_string()) + .collect(); + actual_lines.sort(); + + let base_path = fixtures_path().join("dump_inputs"); + let mut expected_lines = vec![ + path_str(&base_path, "some_file.txt"), + path_str(&base_path, "subfolder/file2.md"), + path_str(&base_path, "subfolder/test.html"), + path_str(&base_path, "markdown.md"), + ]; + expected_lines.sort(); + + assert_eq!(actual_lines, expected_lines); + Ok(()) + } + + // Ensures that `--extensions` restricts the dumped inputs to files with + // one of the listed extensions, so `example.bin` is never printed + #[test] + fn test_dump_inputs_with_extensions() -> Result<()> { + let mut cmd = main_command(); + let test_dir = fixtures_path().join("dump_inputs"); + + let output = cmd + .arg("--dump-inputs") + .arg("--extensions") + .arg("md,txt") + .arg(test_dir) + .assert() + .success() + .get_output() + .stdout + .clone(); + + let mut actual_lines: Vec = output + .lines() + .map(|line| line.unwrap().to_string()) + .collect(); + actual_lines.sort(); + + let base_path = fixtures_path().join("dump_inputs"); + let mut expected_lines = vec![ + path_str(&base_path, "some_file.txt"), + path_str(&base_path, "subfolder/file2.md"), + path_str(&base_path, "markdown.md"), + ]; + expected_lines.sort(); + + assert_eq!(actual_lines, expected_lines); + + // Verify example.bin is not included + for line in &actual_lines { + assert!( + !line.contains("example.bin"), + "Should not contain example.bin: {}", + line + ); + } + + Ok(()) + } + + #[test] + fn test_dump_inputs_skip_hidden() -> Result<()> { + let test_dir = fixtures_path().join("hidden"); + + // Test default behavior (skip hidden) + main_command() + .arg("--dump-inputs") + .arg(&test_dir) + .assert() + .success() + .stdout(is_empty()); + + // Test with --hidden flag + main_command() + .arg("--dump-inputs") + .arg("--hidden") + .arg(test_dir) 
.assert() + .success() + .stdout(contains(".hidden/file.md")); + + Ok(()) + } + + #[test] + fn test_dump_inputs_individual_file() -> Result<()> { + let mut cmd = main_command(); + let test_file = fixtures_path().join("TEST.md"); + cmd.arg("--dump-inputs") - .arg("fixtures") + .arg(&test_file) .assert() .success() - .stdout(contains("fixtures")); + .stdout(contains("fixtures/TEST.md")); Ok(()) } @@ -1895,11 +2015,12 @@ mod cli { #[test] fn test_dump_inputs_stdin() -> Result<()> { let mut cmd = main_command(); + cmd.arg("--dump-inputs") .arg("-") .assert() .success() - .stdout(contains("Stdin")); + .stdout(contains("")); Ok(()) } diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index f3ac22f7f6..d8bfd6a5cc 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -158,10 +158,39 @@ impl Collector { /// Collect all sources from a list of [`Input`]s. For further details, /// see also [`Input::get_sources`](crate::Input#method.get_sources). pub fn collect_sources(self, inputs: HashSet) -> impl Stream> { + self.collect_sources_with_file_types(inputs, crate::types::FileType::default_extensions()) + } + + /// Collect all sources from a list of [`Input`]s with specific file extensions. 
+ pub fn collect_sources_with_file_types( + self, + inputs: HashSet, + file_extensions: FileExtensions, + ) -> impl Stream> + 'static { let seen = Arc::new(DashSet::new()); + let skip_hidden = self.skip_hidden; + let skip_ignored = self.skip_ignored; + let excluded_paths = self.excluded_paths; stream::iter(inputs) - .par_then_unordered(None, move |input| async move { input.get_sources() }) + .par_then_unordered(None, move |input| { + let excluded_paths = excluded_paths.clone(); + let file_extensions = file_extensions.clone(); + async move { + let input_sources = input.get_input_sources( + file_extensions, + skip_hidden, + skip_ignored, + &excluded_paths, + ); + + input_sources + .map(|source| source.map(|source| source.to_string())) + .collect::>() + .await + } + }) + .map(stream::iter) .flatten() .filter_map({ move |source: Result| { diff --git a/lychee-lib/src/types/file.rs b/lychee-lib/src/types/file.rs index 36fb8212d6..85fcce3154 100644 --- a/lychee-lib/src/types/file.rs +++ b/lychee-lib/src/types/file.rs @@ -223,24 +223,21 @@ mod tests { #[test] fn test_extension() { - assert_eq!(FileType::from(Path::new("foo.md")), FileType::Markdown); - assert_eq!(FileType::from(Path::new("foo.MD")), FileType::Markdown); - assert_eq!(FileType::from(Path::new("foo.mdx")), FileType::Markdown); + assert_eq!(FileType::from("foo.md"), FileType::Markdown); + assert_eq!(FileType::from("foo.MD"), FileType::Markdown); + assert_eq!(FileType::from("foo.mdx"), FileType::Markdown); - assert_eq!( - FileType::from(Path::new("test.unknown")), - FileType::Plaintext - ); - assert_eq!(FileType::from(Path::new("test")), FileType::Plaintext); - assert_eq!(FileType::from(Path::new("test.txt")), FileType::Plaintext); - assert_eq!(FileType::from(Path::new("README.TXT")), FileType::Plaintext); + // Test that a file without an extension is considered plaintext + assert_eq!(FileType::from("README"), FileType::Plaintext); + assert_eq!(FileType::from("test"), FileType::Plaintext); - 
assert_eq!(FileType::from(Path::new("test.htm")), FileType::Html); - assert_eq!(FileType::from(Path::new("index.html")), FileType::Html); - assert_eq!( - FileType::from(Path::new("http://foo.com/index.html")), - FileType::Html - ); + assert_eq!(FileType::from("test.unknown"), FileType::Plaintext); + assert_eq!(FileType::from("test.txt"), FileType::Plaintext); + assert_eq!(FileType::from("README.TXT"), FileType::Plaintext); + + assert_eq!(FileType::from("test.htm"), FileType::Html); + assert_eq!(FileType::from("index.html"), FileType::Html); + assert_eq!(FileType::from("http://foo.com/index.html"), FileType::Html); } #[test] diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs index 145d22f104..915e23be2b 100644 --- a/lychee-lib/src/types/input.rs +++ b/lychee-lib/src/types/input.rs @@ -4,6 +4,7 @@ use crate::filter::PathExcludes; use crate::types::FileType; use crate::{ErrorKind, Result}; use async_stream::try_stream; +use futures::StreamExt; use futures::stream::Stream; use glob::glob_with; use ignore::WalkBuilder; @@ -209,6 +210,106 @@ impl Input { } } + /// Get all input sources for content processing. + /// + /// This method returns a stream of input sources for the given input, taking into + /// account the matching file extensions and respecting exclusions. + /// + /// This can be used for retrieving all inputs which lychee would check for + /// links. + /// + /// # Returns + /// + /// Returns a stream of `Result` for all matching input sources. + /// + /// # Errors + /// + /// Will return errors for file system operations or glob pattern issues + pub fn get_input_sources<'a>( + &'a self, + file_extensions: FileExtensions, + skip_hidden: bool, + skip_gitignored: bool, + excluded_paths: &'a PathExcludes, + ) -> impl Stream> + 'a { + try_stream! 
{ + match &self.source { + InputSource::RemoteUrl(url) => { + // Yield the remote URL as an input source + yield InputSource::RemoteUrl(url.clone()); + }, + InputSource::FsGlob { pattern, ignore_case } => { + // For glob patterns, we expand the pattern and yield matching paths + let glob_expanded = tilde(pattern).to_string(); + let mut match_opts = glob::MatchOptions::new(); + match_opts.case_sensitive = !ignore_case; + + for entry in glob_with(&glob_expanded, match_opts)? { + match entry { + Ok(path) => { + // Skip directories or files that don't match extensions + if path.is_dir() { + continue; + } + if Self::is_excluded_path(&path, excluded_paths) { + continue; + } + + // Check if it matches one of our file extensions + if file_extensions_match(&path, &file_extensions) { + yield InputSource::FsPath(path); + } + } + Err(e) => { + eprintln!("Error in glob pattern: {e:?}"); + } + } + } + }, + InputSource::FsPath(path) => { + if path.is_dir() { + + for entry in Input::walk_entries( + path, + file_extensions, + skip_hidden, + skip_gitignored, + )? 
+ { + let entry = entry?; + if Self::is_excluded_path(entry.path(), excluded_paths) { + continue; + } + match entry.file_type() { + None => continue, + Some(file_type) => { + if !file_type.is_file() { + continue; + } + } + } + + yield InputSource::FsPath(entry.path().to_path_buf()); + } + } else { + // For individual files, yield if not excluded and matches extensions + if !Self::is_excluded_path(path, excluded_paths) && file_extensions_match(path, &file_extensions) { + yield InputSource::FsPath(path.clone()); + } + } + }, + InputSource::Stdin => { + // Yield stdin as an input source + yield InputSource::Stdin; + }, + InputSource::String(_) => { + // Yield the string source + yield self.source.clone(); + } + } + } + } + /// Retrieve the contents from the input /// /// If the input is a path, only search through files that match the given @@ -231,6 +332,7 @@ impl Input { excluded_paths: PathExcludes, ) -> impl Stream> { try_stream! { + // Handle special cases first match self.source { InputSource::RemoteUrl(url) => { let content = resolver.url_contents(*url).await; @@ -239,63 +341,84 @@ impl Input { Err(e) => Err(e)?, Ok(content) => yield content, } - }, - InputSource::FsGlob { - ref pattern, - ignore_case, - } => { - for await content in glob_contents(pattern, ignore_case, &excluded_paths) { - let content = content?; - yield content; - } - } - InputSource::FsPath(ref path) => { - if path.is_dir() { - for entry in WalkBuilder::new(path) - .standard_filters(skip_gitignored) - .types(file_extensions.try_into()?) 
- .hidden(skip_hidden) - .build() - { - let entry = entry?; - if is_excluded_path(&excluded_paths, entry.path()) { - continue; - } - match entry.file_type() { - None => continue, - Some(file_type) => { - if !file_type.is_file() { - continue; - } - } - } - let content = Self::path_content(entry.path()).await?; - yield content; - } - } else { - if is_excluded_path(&excluded_paths, path) { - return; - } - let content = Self::path_content(path).await; - match content { - Err(_) if skip_missing => (), - Err(e) => Err(e)?, - Ok(content) => yield content, - } - } + return; }, InputSource::Stdin => { let content = Self::stdin_content(self.file_type_hint).await?; yield content; + return; }, InputSource::String(ref s) => { let content = Self::string_content(s, self.file_type_hint); yield content; + return; }, + _ => {} + } + + // Handle FsPath and FsGlob sources + // We can use `get_input_sources` to get the input sources, which will handle + // filtering by file extensions and exclusions + let mut sources_stream = Box::pin(self.get_input_sources(file_extensions, skip_hidden, skip_gitignored, &excluded_paths)); + + while let Some(source_result) = sources_stream.next().await { + match source_result { + Ok(source) => { + match source { + InputSource::FsPath(path) => { + // Process the actual file path + let content = Self::path_content(&path).await; + match content { + Err(_) if skip_missing => (), + Err(e) => Err(e)?, + Ok(content) => yield content, + } + }, + InputSource::RemoteUrl(url) => { + let content = resolver.url_contents(*url).await; + match content { + Err(_) if skip_missing => (), + Err(e) => Err(e)?, + Ok(content) => yield content, + } + }, + InputSource::Stdin => { + let content = Self::stdin_content(self.file_type_hint).await?; + yield content; + }, + InputSource::String(s) => { + let content = Self::string_content(&s, self.file_type_hint); + yield content; + }, + InputSource::FsGlob { .. 
} => { + unreachable!("This shouldn't happen as `get_input_sources` expands the glob patterns"); + } + } + }, + Err(e) => Err(e)?, + } } } } + /// Build a directory walker from a `WalkBuilder` with consistent traversal settings + fn walk_entries( + path: &Path, + file_extensions: FileExtensions, + skip_hidden: bool, + skip_gitignored: bool, + ) -> Result { + Ok(WalkBuilder::new(path) + // Enable standard filters if `skip_gitignored` is true. + // This will skip files ignored by `.gitignore` and other VCS ignore files. + .standard_filters(skip_gitignored) + // Override hidden file behavior to be controlled by the separate skip_hidden parameter + .hidden(skip_hidden) + // Configure the file types filter to only include files with matching extensions + .types(file_extensions.try_into()?) + .build()) + } + /// Retrieve all sources from this input. The output depends on the type of /// input: /// @@ -307,30 +430,64 @@ impl Input { /// # Errors /// /// Returns an error if the globbing fails with the expanded pattern. - pub fn get_sources(self) -> impl Stream> { + pub fn get_sources( + self, + file_extensions: FileExtensions, + skip_hidden: bool, + skip_gitignored: bool, + excluded_paths: &PathExcludes, + ) -> impl Stream> { try_stream! { match self.source { InputSource::RemoteUrl(url) => yield url.to_string(), - InputSource::FsGlob { pattern, ignore_case } => { + InputSource::FsGlob { + ref pattern, + ignore_case, + } => { let glob_expanded = tilde(&pattern).to_string(); let mut match_opts = glob::MatchOptions::new(); - match_opts.case_sensitive = !ignore_case; - for entry in glob_with(&glob_expanded, match_opts)? 
{ match entry { - Ok(path) => yield path.to_string_lossy().to_string(), - Err(e) => eprintln!("{e:?}") + Ok(path) => { + if !Self::is_excluded_path(&path, excluded_paths) { + yield path.to_string_lossy().to_string(); + } + }, + Err(e) => eprintln!("{e:?}"), } } - }, - InputSource::FsPath(path) => yield path.to_string_lossy().to_string(), - InputSource::Stdin => yield "Stdin".into(), - InputSource::String(_) => yield "Raw String".into(), + } + InputSource::FsPath(ref path) => { + if path.is_dir() { + for entry in Input::walk_entries( + path, + file_extensions, + skip_hidden, + skip_gitignored, + )? { + let entry = entry?; + if !Self::is_excluded_path(entry.path(), excluded_paths) { + // Only yield files, not directories + if entry.file_type().is_some_and(|ft| ft.is_file()) { + yield entry.path().to_string_lossy().to_string(); + } + } + } + } else if !Self::is_excluded_path(path, excluded_paths) { + yield path.to_string_lossy().to_string(); + } + } + InputSource::Stdin => yield "".into(), + InputSource::String(_) => yield "".into(), } } } + /// Check if the given path was excluded from link checking + fn is_excluded_path(path: &Path, excluded_paths: &PathExcludes) -> bool { + excluded_paths.is_match(&path.to_string_lossy()) + } /// Get the input content of a given path /// # Errors /// @@ -378,51 +535,32 @@ impl TryFrom<&str> for Input { } } -fn glob_contents( - pattern: &str, - ignore_case: bool, - excluded_paths: &PathExcludes, -) -> impl Stream> { - let glob_expanded = tilde(&pattern).to_string(); - let mut match_opts = glob::MatchOptions::new(); - - match_opts.case_sensitive = !ignore_case; - - try_stream! { - for entry in glob_with(&glob_expanded, match_opts)? { - match entry { - Ok(path) => { - // Directories can have a suffix which looks like - // a file extension (like `foo.html`). This can lead to - // unexpected behavior with glob patterns like - // `**/*.html`. Therefore filter these out. 
- // See - if path.is_dir() { - continue; - } - if is_excluded_path(excluded_paths, &path) { - continue; - } - let content: InputContent = Input::path_content(&path).await?; - yield content; - } - Err(e) => eprintln!("{e:?}"), - } - } +/// Helper function to check if a file path matches any of the given extensions +fn file_extensions_match(path: &Path, extensions: &FileExtensions) -> bool { + // If the path has no extension, check if we accept plaintext files + // NOTE: We treat files without extensions as plaintext, which might be problematic + // and is therefore subject to change + if path.extension().is_none() { + return extensions.contains("txt") || extensions.contains(""); } -} -/// Function for path exclusion tests -/// -/// This is a standalone function to allow for easier testing -fn is_excluded_path(excluded_paths: &PathExcludes, path: &Path) -> bool { - excluded_paths.is_match(&path.to_string_lossy()) + // Otherwise, check if the extension is in our allowed list + path.extension() + .and_then(|ext| ext.to_str()) + .is_some_and(|ext| extensions.contains(ext.to_lowercase())) } #[cfg(test)] mod tests { use super::*; + /// Function for path exclusion tests + /// + /// This is a standalone function to allow for easier testing + pub fn is_excluded_path(excluded_paths: &PathExcludes, path: &Path) -> bool { + excluded_paths.is_match(&path.to_string_lossy()) + } + #[test] fn test_input_handles_real_relative_paths() { let test_file = "./Cargo.toml";