From ceb19f810aaf8d65a0264a57365e5b88cea55b2e Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 7 Nov 2024 18:49:20 +0100 Subject: [PATCH 01/12] Add support for custom file extensions in link checking. This adds support for overwriting extensions: ``` lychee . --extensions md,html,txt,json,yaml ``` The above would only check these extensions. This was enabled by moving to `ignore` (#1500 by @thomas-zahner). Fixes #410 --- README.md | 7 ++++ lychee-bin/src/main.rs | 2 +- lychee-bin/src/options.rs | 25 ++++++++++++- lychee-lib/src/collector.rs | 29 +++++++++++---- lychee-lib/src/types/file.rs | 70 +++++++++++++++++++++++++---------- lychee-lib/src/types/input.rs | 41 +++++++++----------- 6 files changed, 120 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index 47a26be789..492eca13f8 100644 --- a/README.md +++ b/README.md @@ -334,6 +334,13 @@ Options: Do not show progress bar. This is recommended for non-interactive shells (e.g. for continuous integration) + --extensions + Test the specified file extensions for URIs when checking files locally. + + Multiple extensions can be separated by commas. Note that if you want to check filetypes, + which have multiple extensions, e.g. HTML files with both .html and .htm extensions, you need to + specify both extensions explicitly. + --cache Use request cache stored on disk at `.lycheecache` diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index 6c0b48b733..035be062f1 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -332,7 +332,7 @@ async fn run(opts: &LycheeOptions) -> Result { collector }; - let requests = collector.collect_links(inputs); + let requests = collector.collect_links_with_ext(inputs, opts.config.extensions.clone()); let cache = load_cache(&opts.config).unwrap_or_default(); let cache = Arc::new(cache); diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 25edf5e585..d13a60b19f 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -6,8 +6,9 @@ use clap::builder::PossibleValuesParser; use clap::{arg, builder::TypedValueParser, Parser}; use const_format::{concatcp, formatcp}; use lychee_lib::{ - Base, BasicAuthSelector, Input, StatusCodeExcluder, StatusCodeSelector, DEFAULT_MAX_REDIRECTS, - DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT, + Base, BasicAuthSelector, FileType, Input, StatusCodeExcluder, StatusCodeSelector, + DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, + DEFAULT_USER_AGENT, }; use secrecy::{ExposeSecret, SecretString}; use serde::Deserialize; @@ -228,6 +229,25 @@ pub(crate) struct Config { #[serde(default)] pub(crate) no_progress: bool, + /// A list of custom extensions for link checking + /// + /// E.g. a user can specify `--extensions html,htm,php,asp,aspx,jsp,cgi` + /// to check for links in files with these extensions. + /// + /// This is useful when the default extensions are not enough and you don't + /// want to provide a long list of inputs (e.g. file1.html, file2.md, etc.) + #[arg( + long, + value_delimiter = ',', + long_help = "Test the specified file extensions for URIs when checking files locally. + +Multiple extensions can be separated by commas. Note that if you want to check filetypes, +which have multiple extensions, e.g. HTML files with both .html and .htm extensions, you need to +specify both extensions explicitly." + )] + #[serde(default = "FileType::default_extensions")] + pub(crate) extensions: Vec, + #[arg(help = HELP_MSG_CACHE)] #[arg(long)] #[serde(default)] @@ -584,6 +604,7 @@ impl Config { cookie_jar: None; include_fragments: false; accept: StatusCodeSelector::default(); + extensions: FileType::default_extensions(); } if self diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 7eb4c8c82b..4ea4279c25 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -119,6 +119,12 @@ impl Collector { .flatten() } + /// Convenience method to fetch all unique links from inputs + /// with the default extensions. + pub fn collect_links(self, inputs: Vec) -> impl Stream> { + self.collect_links_with_ext(inputs, crate::types::FileType::default_extensions()) + } + /// Fetch all unique links from inputs /// All relative URLs get prefixed with `base` (if given). /// (This can be a directory or a base URL) @@ -126,7 +132,11 @@ impl Collector { /// # Errors /// /// Will return `Err` if links cannot be extracted from an input - pub fn collect_links(self, inputs: Vec) -> impl Stream> { + pub fn collect_links_with_ext( + self, + inputs: Vec, + extensions: Vec, + ) -> impl Stream> { let skip_missing_inputs = self.skip_missing_inputs; let skip_hidden = self.skip_hidden; let skip_ignored = self.skip_ignored; @@ -134,13 +144,14 @@ impl Collector { stream::iter(inputs) .par_then_unordered(None, move |input| { let default_base = global_base.clone(); + let extensions = extensions.clone(); async move { let base = match &input.source { InputSource::RemoteUrl(url) => Base::try_from(url.as_str()).ok(), _ => default_base, }; input - .get_contents(skip_missing_inputs, skip_hidden, skip_ignored) + .get_contents(skip_missing_inputs, skip_hidden, skip_ignored, extensions) .map(move |content| (content, base.clone())) } }) @@ -191,15 +202,19 @@ mod tests { Ok(responses.map(|r| r.unwrap().uri).collect().await) } - // Helper function for collecting verbatim links + /// Helper function for collecting verbatim links + /// + /// A verbatim link is a link that is not parsed by the HTML parser. + /// For example, a link in a code block or a script tag. async fn collect_verbatim( inputs: Vec, root_dir: Option, base: Option, + extensions: Vec, ) -> Result> { let responses = Collector::new(root_dir, base)? .include_verbatim(true) - .collect_links(inputs); + .collect_links_with_ext(inputs, extensions); Ok(responses.map(|r| r.unwrap().uri).collect().await) } @@ -217,7 +232,7 @@ mod tests { let _file = File::create(&file_path).unwrap(); let input = Input::new(&file_path.as_path().display().to_string(), None, true, None)?; let contents: Vec<_> = input - .get_contents(true, true, true) + .get_contents(true, true, true, FileType::default_extensions()) .collect::>() .await; @@ -230,7 +245,7 @@ mod tests { async fn test_url_without_extension_is_html() -> Result<()> { let input = Input::new("https://example.com/", None, true, None)?; let contents: Vec<_> = input - .get_contents(true, true, true) + .get_contents(true, true, true, FileType::default_extensions()) .collect::>() .await; @@ -288,7 +303,7 @@ mod tests { }, ]; - let links = collect_verbatim(inputs, None, None).await.ok().unwrap(); + let links = collect_verbatim(inputs, None, None, FileType::default_extensions()).await.ok().unwrap(); let expected_links = HashSet::from_iter([ website(TEST_STRING), diff --git a/lychee-lib/src/types/file.rs b/lychee-lib/src/types/file.rs index eb1ea3535b..13461704a4 100644 --- a/lychee-lib/src/types/file.rs +++ b/lychee-lib/src/types/file.rs @@ -12,28 +12,46 @@ pub enum FileType { Plaintext, } +impl FileType { + /// All known Markdown extensions + const MARKDOWN_EXTENSIONS: &'static [&'static str] = &[ + "markdown", "mkdown", "mkdn", "mdwn", "mdown", "mdx", "mkd", "md", + ]; + + /// All known HTML extensions + const HTML_EXTENSIONS: &'static [&'static str] = &["htm", "html"]; + + /// Default extensions which are supported by lychee + #[must_use] + pub fn default_extensions() -> Vec { + let mut extensions = Vec::new(); + extensions.extend(Self::MARKDOWN_EXTENSIONS.iter().map(|&s| s.to_string())); + extensions.extend(Self::HTML_EXTENSIONS.iter().map(|&s| s.to_string())); + extensions + } + + /// Get the [`FileType`] from an extension string + fn from_extension(ext: &str) -> Option { + let ext = ext.to_lowercase(); + if Self::MARKDOWN_EXTENSIONS.contains(&ext.as_str()) { + Some(Self::Markdown) + } else if Self::HTML_EXTENSIONS.contains(&ext.as_str()) { + Some(Self::Html) + } else { + None + } + } +} + impl Default for FileType { fn default() -> Self { + // This is the default file type when no other type can be determined. + // It represents a generic text file with no specific syntax. Self::Plaintext } } impl> From

for FileType { - /// Detect if the given path points to a Markdown, HTML, or plaintext file. - // - // Assume HTML in case of no extension. - // - // This is only reasonable for URLs, not paths on disk. For example, - // a file named `README` without an extension is more likely to be a - // plaintext file. - // - // A better solution would be to also implement `From for - // FileType`. Unfortunately that's not possible without refactoring, as - // `AsRef` could be implemented for `Url` in the future, which is - // why `From for FileType` is not allowed (orphan rule). - // - // As a workaround, we check if the scheme is `http` or `https` and - // assume HTML in that case. fn from(p: P) -> FileType { let path = p.as_ref(); match path @@ -41,12 +59,9 @@ impl> From

for FileType { .and_then(std::ffi::OsStr::to_str) .map(str::to_lowercase) .as_deref() + .and_then(FileType::from_extension) { - // https://superuser.com/a/285878 - Some("markdown" | "mkdown" | "mkdn" | "mdwn" | "mdown" | "mdx" | "mkd" | "md") => { - FileType::Markdown - } - Some("htm" | "html") => FileType::Html, + Some(file_type) => file_type, None if is_url(path) => FileType::Html, _ => FileType::default(), } @@ -86,6 +101,21 @@ mod tests { ); } + #[test] + fn test_default_extensions() { + let extensions = FileType::default_extensions(); + // Test some known extensions + assert!(extensions.contains(&"md".to_string())); + assert!(extensions.contains(&"html".to_string())); + assert!(extensions.contains(&"markdown".to_string())); + assert!(extensions.contains(&"htm".to_string())); + // Test the count matches our static arrays + assert_eq!( + extensions.len(), + FileType::MARKDOWN_EXTENSIONS.len() + FileType::HTML_EXTENSIONS.len() + ); + } + #[test] fn test_is_url() { // Valid URLs diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs index c32be7feb8..47aa3a1758 100644 --- a/lychee-lib/src/types/input.rs +++ b/lychee-lib/src/types/input.rs @@ -3,6 +3,7 @@ use crate::{utils, ErrorKind, Result}; use async_stream::try_stream; use futures::stream::Stream; use glob::glob_with; +use ignore::types::TypesBuilder; use ignore::WalkBuilder; use reqwest::Url; use serde::{Deserialize, Serialize}; @@ -14,12 +15,6 @@ use tokio::io::{stdin, AsyncReadExt}; const STDIN: &str = "-"; -// Check the extension of the given path against the list of known/accepted -// file extensions -fn valid_extension(p: &Path) -> bool { - matches!(FileType::from(p), FileType::Markdown | FileType::Html) -} - #[derive(Debug)] /// Encapsulates the content for a given input pub struct InputContent { @@ -209,6 +204,7 @@ impl Input { skip_missing: bool, skip_hidden: bool, skip_gitignored: bool, + extensions: Vec, ) -> impl Stream> { try_stream! { match self.source { @@ -231,24 +227,32 @@ impl Input { } InputSource::FsPath(ref path) => { if path.is_dir() { - for entry in WalkBuilder::new(path).standard_filters(skip_gitignored).hidden(skip_hidden).build() { - let entry = entry?; + let mut types_builder = TypesBuilder::new(); + for ext in extensions { + types_builder.add(&ext, &format!("*.{ext}"))?; + } + + for entry in WalkBuilder::new(path) + .standard_filters(skip_gitignored) + .types(types_builder.select("all").build()?) + .hidden(skip_hidden) + .build() + { + let entry = entry?; if self.is_excluded_path(&entry.path().to_path_buf()) { continue; } - match entry.file_type() { None => continue, Some(file_type) => { - if !file_type.is_file() || !valid_extension(entry.path()) { + if !file_type.is_file() { continue; } } - }; - + } let content = Self::path_content(entry.path()).await?; - yield content + yield content; } } else { if self.is_excluded_path(path) { @@ -459,17 +463,6 @@ mod tests { assert!(matches!(input, Err(ErrorKind::InvalidFile(PathBuf { .. })))); } - #[test] - fn test_valid_extension() { - assert!(valid_extension(Path::new("file.md"))); - assert!(valid_extension(Path::new("file.markdown"))); - assert!(valid_extension(Path::new("file.html"))); - assert!(valid_extension(Path::new("file.htm"))); - assert!(valid_extension(Path::new("file.HTM"))); - assert!(!valid_extension(Path::new("file.txt"))); - assert!(!valid_extension(Path::new("file"))); - } - #[test] fn test_no_exclusions() { let dir = tempfile::tempdir().unwrap(); From 5b6ad51f29cba9c0c7c4fa325fef34e9667b4528 Mon Sep 17 00:00:00 2001 From: Matthias Endler Date: Sat, 9 Nov 2024 00:47:24 +0100 Subject: [PATCH 02/12] Update lychee-bin/src/options.rs Co-authored-by: Thomas Zahner --- lychee-bin/src/options.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index d13a60b19f..64aeffa692 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -229,7 +229,7 @@ pub(crate) struct Config { #[serde(default)] pub(crate) no_progress: bool, - /// A list of custom extensions for link checking + /// A list of file extensions. Files not matching the specified extensions are skipped. /// /// E.g. a user can specify `--extensions html,htm,php,asp,aspx,jsp,cgi` /// to check for links in files with these extensions. From f60619b7acfcff1e2dc2f7c0a276f1206dd95ee2 Mon Sep 17 00:00:00 2001 From: Matthias Date: Wed, 26 Feb 2025 00:30:25 +0100 Subject: [PATCH 03/12] Refactor file extension handling to use FileExtensions type for improved clarity and functionality --- lychee-bin/src/main.rs | 2 +- lychee-bin/src/options.rs | 16 +++-- lychee-lib/src/collector.rs | 19 ++--- lychee-lib/src/lib.rs | 6 +- lychee-lib/src/types/file.rs | 130 +++++++++++++++++++++++++++++++--- lychee-lib/src/types/input.rs | 18 ++--- lychee-lib/src/types/mod.rs | 2 +- 7 files changed, 154 insertions(+), 39 deletions(-) diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index 035be062f1..36dffb9686 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -332,7 +332,7 @@ async fn run(opts: &LycheeOptions) -> Result { collector }; - let requests = collector.collect_links_with_ext(inputs, opts.config.extensions.clone()); + let requests = collector.collect_links_from_file_types(inputs, opts.config.extensions.clone()); let cache = load_cache(&opts.config).unwrap_or_default(); let cache = Arc::new(cache); diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 64aeffa692..5a4deb8c4c 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -6,9 +6,9 @@ use clap::builder::PossibleValuesParser; use clap::{arg, builder::TypedValueParser, Parser}; use const_format::{concatcp, formatcp}; use lychee_lib::{ - Base, BasicAuthSelector, FileType, Input, StatusCodeExcluder, StatusCodeSelector, - DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, - DEFAULT_USER_AGENT, + Base, BasicAuthSelector, FileExtensions, FileType, Input, StatusCodeExcluder, + StatusCodeSelector, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, + DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT, }; use secrecy::{ExposeSecret, SecretString}; use serde::Deserialize; @@ -238,15 +238,17 @@ pub(crate) struct Config { /// want to provide a long list of inputs (e.g. file1.html, file2.md, etc.) #[arg( long, - value_delimiter = ',', + value_parser = |s: &str| -> Result { + Ok(FileExtensions::from(s.split(',').map(String::from).collect::>())) + }, long_help = "Test the specified file extensions for URIs when checking files locally. - + Multiple extensions can be separated by commas. Note that if you want to check filetypes, which have multiple extensions, e.g. HTML files with both .html and .htm extensions, you need to specify both extensions explicitly." )] - #[serde(default = "FileType::default_extensions")] - pub(crate) extensions: Vec, + #[serde(default = "FileExtensions::default")] + pub(crate) extensions: FileExtensions, #[arg(help = HELP_MSG_CACHE)] #[arg(long)] diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 4ea4279c25..7794d3b058 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -1,8 +1,8 @@ use crate::ErrorKind; use crate::InputSource; use crate::{ - basic_auth::BasicAuthExtractor, extract::Extractor, types::uri::raw::RawUri, utils::request, - Base, Input, Request, Result, + basic_auth::BasicAuthExtractor, extract::Extractor, types::uri::raw::RawUri, + types::FileExtensions, utils::request, Base, Input, Request, Result, }; use futures::TryStreamExt; use futures::{ @@ -122,7 +122,7 @@ impl Collector { /// Convenience method to fetch all unique links from inputs /// with the default extensions. pub fn collect_links(self, inputs: Vec) -> impl Stream> { - self.collect_links_with_ext(inputs, crate::types::FileType::default_extensions()) + self.collect_links_from_file_types(inputs, crate::types::FileType::default_extensions()) } /// Fetch all unique links from inputs @@ -132,10 +132,10 @@ impl Collector { /// # Errors /// /// Will return `Err` if links cannot be extracted from an input - pub fn collect_links_with_ext( + pub fn collect_links_from_file_types( self, inputs: Vec, - extensions: Vec, + extensions: FileExtensions, ) -> impl Stream> { let skip_missing_inputs = self.skip_missing_inputs; let skip_hidden = self.skip_hidden; @@ -210,11 +210,11 @@ mod tests { inputs: Vec, root_dir: Option, base: Option, - extensions: Vec, + extensions: FileExtensions, ) -> Result> { let responses = Collector::new(root_dir, base)? .include_verbatim(true) - .collect_links_with_ext(inputs, extensions); + .collect_links_from_file_types(inputs, extensions); Ok(responses.map(|r| r.unwrap().uri).collect().await) } @@ -303,7 +303,10 @@ mod tests { }, ]; - let links = collect_verbatim(inputs, None, None, FileType::default_extensions()).await.ok().unwrap(); + let links = collect_verbatim(inputs, None, None, FileType::default_extensions()) + .await + .ok() + .unwrap(); let expected_links = HashSet::from_iter([ website(TEST_STRING), diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs index 023278c034..bbaca0607e 100644 --- a/lychee-lib/src/lib.rs +++ b/lychee-lib/src/lib.rs @@ -97,8 +97,8 @@ pub use crate::{ filter::{Excludes, Filter, Includes}, types::{ uri::valid::Uri, AcceptRange, AcceptRangeError, Base, BasicAuthCredentials, - BasicAuthSelector, CacheStatus, CookieJar, ErrorKind, FileType, Input, InputContent, - InputSource, Request, Response, ResponseBody, Result, Status, StatusCodeExcluder, - StatusCodeSelector, + BasicAuthSelector, CacheStatus, CookieJar, ErrorKind, FileExtensions, FileType, Input, + InputContent, InputSource, Request, Response, ResponseBody, Result, Status, + StatusCodeExcluder, StatusCodeSelector, }, }; diff --git a/lychee-lib/src/types/file.rs b/lychee-lib/src/types/file.rs index 13461704a4..37d6c2ff93 100644 --- a/lychee-lib/src/types/file.rs +++ b/lychee-lib/src/types/file.rs @@ -1,7 +1,98 @@ +use ignore::types::{Types, TypesBuilder}; +use serde::{Deserialize, Serialize}; use std::path::Path; use url::Url; -#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +/// Represents an ordered list of file extensions. +/// +/// This holds the actual extension strings (e.g. `md`, `html`, etc.) and is +/// used to build a [`Types`] object which can be used to match file types. +/// +/// In a sense, it is more "low-level" than [`FileType`] as it is closer to the +/// actual representation of file extensions, while [`FileType`] is a higher-level +/// abstraction that represents the "category" of a file (e.g. Markdown, HTML). +/// +/// The order is significant as extensions at the beginning of the vector will +/// be treated with higher priority (e.g. when deciding which file to pick out +/// of a set of options) +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] +pub struct FileExtensions(Vec); + +impl Default for FileExtensions { + fn default() -> Self { + FileType::default().into() + } +} + +impl FileExtensions { + /// Create an empty list of file extensions + #[must_use] + pub const fn empty() -> Self { + Self(vec![]) + } + + /// Extend the list of existing extensions by the values from the iterator + pub fn extend>(&mut self, iter: I) { + self.0.extend(iter); + } + + /// Check if the list of file extensions contains the given file extension + pub fn contains>(&self, file_extension: T) -> bool { + self.0.contains(&file_extension.into()) + } + + /// Build the current list of file extensions into a file type matcher. + /// + /// # Errors + /// + /// Fails if an extension is `all` or otherwise contains any character that + /// is not a Unicode letter or number. + pub fn all(&self) -> super::Result { + let mut types_builder = TypesBuilder::new(); + for ext in self.0.clone() { + types_builder.add(&ext, &format!("*.{ext}"))?; + } + Ok(types_builder.select("all").build()?) + } +} + +impl From for Vec { + fn from(value: FileExtensions) -> Self { + value.0 + } +} + +impl From> for FileExtensions { + fn from(value: Vec) -> Self { + Self(value) + } +} + +impl From for FileExtensions { + fn from(file_type: FileType) -> Self { + match file_type { + FileType::Html => FileType::html_extensions(), + FileType::Markdown => FileType::markdown_extensions(), + FileType::Plaintext => Self::empty(), + } + } +} + +impl FromIterator for FileExtensions { + fn from_iter>(iter: T) -> Self { + Self(iter.into_iter().collect()) + } +} + +impl Iterator for FileExtensions { + type Item = String; + + fn next(&mut self) -> Option { + self.0.pop() + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, Deserialize, Serialize)] /// `FileType` defines which file types lychee can handle pub enum FileType { /// File in HTML format @@ -23,13 +114,31 @@ impl FileType { /// Default extensions which are supported by lychee #[must_use] - pub fn default_extensions() -> Vec { - let mut extensions = Vec::new(); - extensions.extend(Self::MARKDOWN_EXTENSIONS.iter().map(|&s| s.to_string())); - extensions.extend(Self::HTML_EXTENSIONS.iter().map(|&s| s.to_string())); + pub fn default_extensions() -> FileExtensions { + let mut extensions = FileExtensions::empty(); + extensions.extend(Self::markdown_extensions()); + extensions.extend(Self::html_extensions()); extensions } + /// All known Markdown extensions + #[must_use] + pub fn markdown_extensions() -> FileExtensions { + Self::MARKDOWN_EXTENSIONS + .iter() + .map(|&s| s.to_string()) + .collect() + } + + /// All known HTML extensions + #[must_use] + pub fn html_extensions() -> FileExtensions { + Self::HTML_EXTENSIONS + .iter() + .map(|&s| s.to_string()) + .collect() + } + /// Get the [`FileType`] from an extension string fn from_extension(ext: &str) -> Option { let ext = ext.to_lowercase(); @@ -105,13 +214,14 @@ mod tests { fn test_default_extensions() { let extensions = FileType::default_extensions(); // Test some known extensions - assert!(extensions.contains(&"md".to_string())); - assert!(extensions.contains(&"html".to_string())); - assert!(extensions.contains(&"markdown".to_string())); - assert!(extensions.contains(&"htm".to_string())); + assert!(extensions.contains("md")); + assert!(extensions.contains("html")); + assert!(extensions.contains("markdown")); + assert!(extensions.contains("htm")); // Test the count matches our static arrays + let all_extensions: Vec<_> = extensions.into(); assert_eq!( - extensions.len(), + all_extensions.len(), FileType::MARKDOWN_EXTENSIONS.len() + FileType::HTML_EXTENSIONS.len() ); } diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs index 47aa3a1758..84c4d73acc 100644 --- a/lychee-lib/src/types/input.rs +++ b/lychee-lib/src/types/input.rs @@ -3,7 +3,6 @@ use crate::{utils, ErrorKind, Result}; use async_stream::try_stream; use futures::stream::Stream; use glob::glob_with; -use ignore::types::TypesBuilder; use ignore::WalkBuilder; use reqwest::Url; use serde::{Deserialize, Serialize}; @@ -13,6 +12,8 @@ use std::fs; use std::path::{Path, PathBuf}; use tokio::io::{stdin, AsyncReadExt}; +use super::file::FileExtensions; + const STDIN: &str = "-"; #[derive(Debug)] @@ -194,6 +195,9 @@ impl Input { /// Retrieve the contents from the input /// + /// If the input is a path, only search through files that match the given + /// file extensions. + /// /// # Errors /// /// Returns an error if the contents can not be retrieved @@ -204,7 +208,9 @@ impl Input { skip_missing: bool, skip_hidden: bool, skip_gitignored: bool, - extensions: Vec, + // If `Input` is a file path, try the given file extensions in order. + // Stop on the first match. + file_extensions: FileExtensions, ) -> impl Stream> { try_stream! { match self.source { @@ -227,15 +233,9 @@ impl Input { } InputSource::FsPath(ref path) => { if path.is_dir() { - - let mut types_builder = TypesBuilder::new(); - for ext in extensions { - types_builder.add(&ext, &format!("*.{ext}"))?; - } - for entry in WalkBuilder::new(path) .standard_filters(skip_gitignored) - .types(types_builder.select("all").build()?) + .types(file_extensions.all()?) .hidden(skip_hidden) .build() { diff --git a/lychee-lib/src/types/mod.rs b/lychee-lib/src/types/mod.rs index 5bb933fe0b..ab1c9e7c65 100644 --- a/lychee-lib/src/types/mod.rs +++ b/lychee-lib/src/types/mod.rs @@ -21,7 +21,7 @@ pub use basic_auth::{BasicAuthCredentials, BasicAuthSelector}; pub use cache::CacheStatus; pub use cookies::CookieJar; pub use error::ErrorKind; -pub use file::FileType; +pub use file::{FileExtensions, FileType}; pub use input::{Input, InputContent, InputSource}; pub use request::Request; pub use response::{Response, ResponseBody}; From 60895a0c4dbe9deca2eeaedf206ed291214f2acf Mon Sep 17 00:00:00 2001 From: Matthias Date: Wed, 26 Feb 2025 00:34:34 +0100 Subject: [PATCH 04/12] Refactor file extension parsing to use FromStr implementation for improved clarity --- lychee-bin/src/options.rs | 3 --- lychee-lib/src/types/file.rs | 8 ++++++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 5a4deb8c4c..5e8a49f87b 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -238,9 +238,6 @@ pub(crate) struct Config { /// want to provide a long list of inputs (e.g. file1.html, file2.md, etc.) #[arg( long, - value_parser = |s: &str| -> Result { - Ok(FileExtensions::from(s.split(',').map(String::from).collect::>())) - }, long_help = "Test the specified file extensions for URIs when checking files locally. Multiple extensions can be separated by commas. Note that if you want to check filetypes, diff --git a/lychee-lib/src/types/file.rs b/lychee-lib/src/types/file.rs index 37d6c2ff93..f768f664f0 100644 --- a/lychee-lib/src/types/file.rs +++ b/lychee-lib/src/types/file.rs @@ -92,6 +92,14 @@ impl Iterator for FileExtensions { } } +impl std::str::FromStr for FileExtensions { + type Err = std::convert::Infallible; // Cannot fail parsing + + fn from_str(s: &str) -> Result { + Ok(Self(s.split(',').map(String::from).collect())) + } +} + #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, Deserialize, Serialize)] /// `FileType` defines which file types lychee can handle pub enum FileType { From 623f9aeffc47f53c794cc2a09d14870388d513dd Mon Sep 17 00:00:00 2001 From: Matthias Date: Wed, 26 Feb 2025 00:36:06 +0100 Subject: [PATCH 05/12] order derive and comment --- lychee-lib/src/types/file.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lychee-lib/src/types/file.rs b/lychee-lib/src/types/file.rs index f768f664f0..ea4ca511e4 100644 --- a/lychee-lib/src/types/file.rs +++ b/lychee-lib/src/types/file.rs @@ -100,8 +100,8 @@ impl std::str::FromStr for FileExtensions { } } -#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, Deserialize, Serialize)] /// `FileType` defines which file types lychee can handle +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, Deserialize, Serialize)] pub enum FileType { /// File in HTML format Html, From aa53a8053b1b1ac5bc32fa5199763a1ba63fda0b Mon Sep 17 00:00:00 2001 From: Matthias Date: Wed, 26 Feb 2025 00:39:29 +0100 Subject: [PATCH 06/12] typo --- lychee-lib/src/types/file.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lychee-lib/src/types/file.rs b/lychee-lib/src/types/file.rs index ea4ca511e4..bc66258acd 100644 --- a/lychee-lib/src/types/file.rs +++ b/lychee-lib/src/types/file.rs @@ -226,7 +226,7 @@ mod tests { assert!(extensions.contains("html")); assert!(extensions.contains("markdown")); assert!(extensions.contains("htm")); - // Test the count matches our static arrays + // Test that the count matches our static arrays let all_extensions: Vec<_> = extensions.into(); assert_eq!( all_extensions.len(), From 3f1613ad603ade60571d2b73a0cc19fe22b32092 Mon Sep 17 00:00:00 2001 From: Matthias Date: Wed, 26 Feb 2025 00:45:19 +0100 Subject: [PATCH 07/12] Add plaintext extension handling to FileType --- lychee-lib/src/types/file.rs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/lychee-lib/src/types/file.rs b/lychee-lib/src/types/file.rs index bc66258acd..19aaabfe7d 100644 --- a/lychee-lib/src/types/file.rs +++ b/lychee-lib/src/types/file.rs @@ -73,7 +73,7 @@ impl From for FileExtensions { match file_type { FileType::Html => FileType::html_extensions(), FileType::Markdown => FileType::markdown_extensions(), - FileType::Plaintext => Self::empty(), + FileType::Plaintext => FileType::plaintext_extensions(), } } } @@ -120,12 +120,16 @@ impl FileType { /// All known HTML extensions const HTML_EXTENSIONS: &'static [&'static str] = &["htm", "html"]; - /// Default extensions which are supported by lychee + /// All known plaintext extensions + const PLAINTEXT_EXTENSIONS: &'static [&'static str] = &["txt"]; + + /// Default extensions which are checked by lychee #[must_use] pub fn default_extensions() -> FileExtensions { let mut extensions = FileExtensions::empty(); extensions.extend(Self::markdown_extensions()); extensions.extend(Self::html_extensions()); + extensions.extend(Self::plaintext_extensions()); extensions } @@ -147,6 +151,15 @@ impl FileType { .collect() } + /// All known plaintext extensions + #[must_use] + pub fn plaintext_extensions() -> FileExtensions { + Self::PLAINTEXT_EXTENSIONS + .iter() + .map(|&s| s.to_string()) + .collect() + } + /// Get the [`FileType`] from an extension string fn from_extension(ext: &str) -> Option { let ext = ext.to_lowercase(); From 657090b7a314f2f75d0c590e09f5bf13fe599e95 Mon Sep 17 00:00:00 2001 From: Matthias Date: Wed, 26 Feb 2025 00:49:29 +0100 Subject: [PATCH 08/12] Set default value for file extensions in Config and implement Display for FileExtensions --- lychee-bin/src/options.rs | 1 + lychee-lib/src/types/file.rs | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 5e8a49f87b..c6e18ee287 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -238,6 +238,7 @@ pub(crate) struct Config { /// want to provide a long list of inputs (e.g. file1.html, file2.md, etc.) #[arg( long, + default_value_t = FileExtensions::default(), long_help = "Test the specified file extensions for URIs when checking files locally. Multiple extensions can be separated by commas. Note that if you want to check filetypes, diff --git a/lychee-lib/src/types/file.rs b/lychee-lib/src/types/file.rs index 19aaabfe7d..9ebbb71b20 100644 --- a/lychee-lib/src/types/file.rs +++ b/lychee-lib/src/types/file.rs @@ -20,7 +20,13 @@ pub struct FileExtensions(Vec); impl Default for FileExtensions { fn default() -> Self { - FileType::default().into() + FileType::default_extensions().into() + } +} + +impl std::fmt::Display for FileExtensions { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0.join(",")) } } From f498492356dd5c02f5b61f1e4641832a1d5b230d Mon Sep 17 00:00:00 2001 From: Matthias Date: Wed, 26 Feb 2025 00:50:40 +0100 Subject: [PATCH 09/12] cleanup --- lychee-lib/src/types/file.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lychee-lib/src/types/file.rs b/lychee-lib/src/types/file.rs index 9ebbb71b20..e5c19cfe8a 100644 --- a/lychee-lib/src/types/file.rs +++ b/lychee-lib/src/types/file.rs @@ -20,7 +20,7 @@ pub struct FileExtensions(Vec); impl Default for FileExtensions { fn default() -> Self { - FileType::default_extensions().into() + FileType::default_extensions() } } From e5cfd2fa2c74a68eb42cc284f281cd5a8f4780f0 Mon Sep 17 00:00:00 2001 From: Matthias Date: Wed, 26 Feb 2025 01:23:28 +0100 Subject: [PATCH 10/12] Add default file extensions to README documentation --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 492eca13f8..1a8dab88ef 100644 --- a/README.md +++ b/README.md @@ -341,6 +341,8 @@ Options: which have multiple extensions, e.g. HTML files with both .html and .htm extensions, you need to specify both extensions explicitly. + [default: md,mkd,mdx,mdown,mdwn,mkdn,mkdown,markdown,html,htm,txt] + --cache Use request cache stored on disk at `.lycheecache` From e475bbbef430dafdfc5582580405b28830ec10d5 Mon Sep 17 00:00:00 2001 From: Matthias Date: Sun, 2 Mar 2025 01:38:02 +0100 Subject: [PATCH 11/12] Implement TryFrom for FileExtensions and update input handling to use try_into --- lychee-lib/src/types/file.rs | 8 ++++++-- lychee-lib/src/types/input.rs | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/lychee-lib/src/types/file.rs b/lychee-lib/src/types/file.rs index e5c19cfe8a..ffd75c1f1f 100644 --- a/lychee-lib/src/types/file.rs +++ b/lychee-lib/src/types/file.rs @@ -46,6 +46,10 @@ impl FileExtensions { pub fn contains>(&self, file_extension: T) -> bool { self.0.contains(&file_extension.into()) } +} + +impl TryFrom for Types { + type Error = super::ErrorKind; /// Build the current list of file extensions into a file type matcher. /// @@ -53,9 +57,9 @@ impl FileExtensions { /// /// Fails if an extension is `all` or otherwise contains any character that /// is not a Unicode letter or number. - pub fn all(&self) -> super::Result { + fn try_from(value: FileExtensions) -> super::Result { let mut types_builder = TypesBuilder::new(); - for ext in self.0.clone() { + for ext in value.0.clone() { types_builder.add(&ext, &format!("*.{ext}"))?; } Ok(types_builder.select("all").build()?) diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs index 84c4d73acc..effce2f1f0 100644 --- a/lychee-lib/src/types/input.rs +++ b/lychee-lib/src/types/input.rs @@ -235,7 +235,7 @@ impl Input { if path.is_dir() { for entry in WalkBuilder::new(path) .standard_filters(skip_gitignored) - .types(file_extensions.all()?) + .types(file_extensions.try_into()?) .hidden(skip_hidden) .build() { From ee4e70876598e0370efed15f14fed537714dfecd Mon Sep 17 00:00:00 2001 From: Matthias Date: Sun, 2 Mar 2025 01:49:59 +0100 Subject: [PATCH 12/12] Add plaintext extension handling to FileType and fix test --- lychee-lib/src/types/file.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lychee-lib/src/types/file.rs b/lychee-lib/src/types/file.rs index ffd75c1f1f..36fb8212d6 100644 --- a/lychee-lib/src/types/file.rs +++ b/lychee-lib/src/types/file.rs @@ -177,6 +177,8 @@ impl FileType { Some(Self::Markdown) } else if Self::HTML_EXTENSIONS.contains(&ext.as_str()) { Some(Self::Html) + } else if Self::PLAINTEXT_EXTENSIONS.contains(&ext.as_str()) { + Some(Self::Plaintext) } else { None } @@ -253,7 +255,9 @@ mod tests { let all_extensions: Vec<_> = extensions.into(); assert_eq!( all_extensions.len(), - FileType::MARKDOWN_EXTENSIONS.len() + FileType::HTML_EXTENSIONS.len() + FileType::MARKDOWN_EXTENSIONS.len() + + FileType::HTML_EXTENSIONS.len() + + FileType::PLAINTEXT_EXTENSIONS.len() ); }