From b6313e9810d3f00dc3eb0e871ea08c0168c0df0f Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Wed, 22 Oct 2025 17:42:09 +0200 Subject: [PATCH 01/10] Fix benches & remove TryFrom which was only used in bench --- Cargo.lock | 6 +++-- Makefile | 2 +- benches/Cargo.toml | 1 + benches/src/extract.rs | 34 +++++++++++++++++---------- lychee-lib/src/types/input/content.rs | 27 --------------------- 5 files changed, 27 insertions(+), 43 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 64ed30ff3e..c0af6ef8c6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -447,6 +447,7 @@ version = "0.0.0" dependencies = [ "criterion", "lychee-lib", + "tokio", ] [[package]] @@ -1746,12 +1747,13 @@ dependencies = [ [[package]] name = "half" -version = "2.6.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ "cfg-if", "crunchy", + "zerocopy", ] [[package]] diff --git a/Makefile b/Makefile index 3c64f93354..e0221a5d90 100644 --- a/Makefile +++ b/Makefile @@ -56,7 +56,7 @@ test: ## Run tests .PHONY: bench bench: ## Run benchmarks - cargo bench + cargo bench --profile=dev .PHONY: doc doc: ## Open documentation diff --git a/benches/Cargo.toml b/benches/Cargo.toml index 103e60602f..1e41ceadf9 100644 --- a/benches/Cargo.toml +++ b/benches/Cargo.toml @@ -10,6 +10,7 @@ publish = false [dependencies] lychee-lib = { path = "../lychee-lib", default-features = false } criterion = "0.7.0" +tokio = "1.48.0" [features] email-check = ["lychee-lib/email-check"] diff --git a/benches/src/extract.rs b/benches/src/extract.rs index 0a18f887b9..82a15c021a 100644 --- a/benches/src/extract.rs +++ b/benches/src/extract.rs @@ -1,14 +1,12 @@ use criterion::{Criterion, criterion_group, criterion_main}; use lychee_lib::extract::Extractor; -use lychee_lib::{FileType, InputContent}; +use lychee_lib::{FileType, Input, InputContent}; use std::hint::black_box; -use std::path::PathBuf; -fn extract(paths: &[PathBuf]) { - for path in paths { - let content: InputContent = path.try_into().unwrap(); +fn extract(inputs: &Vec) { + for input in inputs { let extractor = Extractor::default(); - let extracted = extractor.extract(&content); + let extracted = extractor.extract(input); println!("{}", extracted.len()); } } @@ -36,14 +34,24 @@ fn benchmark_input_content_creation(c: &mut Criterion) { fn benchmark(c: &mut Criterion) { // Currently Wikipedia's biggest featured article - c.bench_function("extract from large docs", |b| { - b.iter(|| { - extract(black_box(&[ - PathBuf::from("../fixtures/bench/elvis.html"), - PathBuf::from("../fixtures/bench/arch.html"), - ])) - }) + let mut inputs = vec![]; + + let runtime = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + + runtime.block_on(async { + inputs = vec![ + Input::path_content("../fixtures/bench/elvis.html") + .await + .unwrap(), + Input::path_content("../fixtures/bench/arch.html") + .await + .unwrap(), + ]; }); + + c.bench_function("extract from large docs", |b| b.iter(|| extract(&inputs))); } criterion_group!( diff --git a/lychee-lib/src/types/input/content.rs b/lychee-lib/src/types/input/content.rs index 9a7dc1c3db..49dfca2be5 100644 --- a/lychee-lib/src/types/input/content.rs +++ b/lychee-lib/src/types/input/content.rs @@ -4,11 +4,8 @@ //! input sources, along with metadata about the source and file type. use super::source::ResolvedInputSource; -use crate::ErrorKind; use crate::types::FileType; use std::borrow::Cow; -use std::fs; -use std::path::PathBuf; /// Encapsulates the content for a given input #[derive(Debug)] @@ -43,27 +40,3 @@ impl InputContent { } } } - -impl TryFrom<&PathBuf> for InputContent { - type Error = crate::ErrorKind; - - fn try_from(path: &PathBuf) -> std::result::Result { - let input = match fs::read_to_string(path) { - Ok(content) => content, - Err(e) if e.kind() == std::io::ErrorKind::InvalidData => { - log::warn!( - "Skipping file with invalid UTF-8 content: {}", - path.display() - ); - return Err(ErrorKind::ReadFileInput(e, path.clone())); - } - Err(e) => return Err(ErrorKind::ReadFileInput(e, path.clone())), - }; - - Ok(Self { - source: ResolvedInputSource::String(Cow::Owned(input.clone())), - file_type: FileType::from(path), - content: input, - }) - } -} From d76817605d7e1b85cf16f0e0def51e04b919bb3b Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Fri, 24 Oct 2025 17:23:22 +0200 Subject: [PATCH 02/10] Basic concept working --- lychee-bin/src/main.rs | 3 +- lychee-bin/src/options.rs | 11 ++++++++ lychee-lib/src/collector.rs | 14 ++++++++- lychee-lib/src/types/error.rs | 8 +++++- lychee-lib/src/types/input/input.rs | 44 +++++++++++++++++++++++------ 5 files changed, 68 insertions(+), 12 deletions(-) diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index e43f02396c..3e8287d4b3 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -350,7 +350,8 @@ async fn run(opts: &LycheeOptions) -> Result { .excluded_paths(PathExcludes::new(opts.config.exclude_path.clone())?) // File a bug if you rely on this envvar! It's going to go away eventually. .use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").is_ok_and(|x| x == "1")) - .include_wikilinks(opts.config.include_wikilinks); + .include_wikilinks(opts.config.include_wikilinks) + .pre(opts.config.pre.clone()); collector = if let Some(ref basic_auth) = opts.config.basic_auth { collector.basic_auth_extractor(BasicAuthExtractor::new(basic_auth)?) diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index c03a8de9f4..ab94fadf98 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -853,6 +853,16 @@ and existing cookies will be updated." #[arg(long)] #[serde(default)] pub(crate) include_wikilinks: bool, + + /// Preprocess input files. + #[arg( + short, + long, + long_help = "Preprocess input files. +This allows files not recognized by lychee to be converted into a compatible format." + )] + #[serde(default)] + pub(crate) pre: Option, } impl Config { @@ -943,6 +953,7 @@ impl Config { no_progress: false, offline: false, output: None, + pre: None, remap: Vec::::new(), require_https: false, retry_wait_time: DEFAULT_RETRY_WAIT_TIME_SECS, diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 4e1ab15775..c7bf1f1341 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -36,6 +36,7 @@ pub struct Collector { excluded_paths: PathExcludes, headers: HeaderMap, client: Client, + pre: Option, } impl Default for Collector { @@ -59,6 +60,7 @@ impl Default for Collector { headers: HeaderMap::new(), client: Client::new(), excluded_paths: PathExcludes::empty(), + pre: None, } } } @@ -84,6 +86,7 @@ impl Collector { use_html5ever: false, skip_hidden: true, skip_ignored: true, + pre: None, headers: HeaderMap::new(), client: Client::builder() .build() @@ -143,14 +146,21 @@ impl Collector { self } - #[allow(clippy::doc_markdown)] /// Check WikiLinks in Markdown files + #[allow(clippy::doc_markdown)] #[must_use] pub const fn include_wikilinks(mut self, yes: bool) -> Self { self.include_wikilinks = yes; self } + /// Skip over links in verbatim sections (like Markdown code blocks) + #[must_use] + pub fn pre(mut self, pre: Option) -> Self { + self.pre = pre; + self + } + /// Pass a [`BasicAuthExtractor`] which is capable to match found /// URIs to basic auth credentials. These credentials get passed to the /// request in question. @@ -263,6 +273,7 @@ impl Collector { let extensions = extensions.clone(); let resolver = resolver.clone(); let excluded_paths = excluded_paths.clone(); + let pre = self.pre.clone(); async move { let base = match &input.source { @@ -278,6 +289,7 @@ impl Collector { extensions, resolver, excluded_paths, + pre, ) .map(move |content| (content, base.clone())) } diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 3abe0e72ec..91df63515d 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -170,6 +170,10 @@ pub enum ErrorKind { #[error("Status code range error")] StatusCodeSelectorError(#[from] StatusCodeSelectorError), + /// Preprocessor command error + #[error("Preprocessor command '{0}' failed: {1}")] + PreprocessorError(String, String), + /// Test-only error variant for formatter tests /// Available in both test and debug builds to support cross-crate testing #[cfg(any(test, debug_assertions))] @@ -334,7 +338,8 @@ impl ErrorKind { [] => "No directory links are allowed because index_files is defined and empty".to_string(), [name] => format!("An index file ({name}) is required"), [init @ .., tail] => format!("An index file ({}, or {}) is required", init.join(", "), tail), - }.into() + }.into(), + ErrorKind::PreprocessorError(command, reason) => Some(format!("Command '{command}' failed {reason}. Check value of the pre option")) } } @@ -470,6 +475,7 @@ impl Hash for ErrorKind { Self::BasicAuthExtractorError(e) => e.to_string().hash(state), Self::Cookies(e) => e.hash(state), Self::StatusCodeSelectorError(e) => e.to_string().hash(state), + Self::PreprocessorError(c, e) => (c, e).hash(state), } } } diff --git a/lychee-lib/src/types/input/input.rs b/lychee-lib/src/types/input/input.rs index b1b51885c1..bf36ac54ca 100644 --- a/lychee-lib/src/types/input/input.rs +++ b/lychee-lib/src/types/input/input.rs @@ -19,6 +19,7 @@ use ignore::WalkBuilder; use reqwest::Url; use shellexpand::tilde; use std::path::{Path, PathBuf}; +use std::process::Command; use tokio::io::{AsyncReadExt, stdin}; const STDIN: &str = "-"; @@ -170,6 +171,7 @@ impl Input { file_extensions: FileExtensions, resolver: UrlContentResolver, excluded_paths: PathExcludes, + pre: Option, ) -> impl Stream> { try_stream! { // Handle simple cases that don't need resolution @@ -209,7 +211,7 @@ impl Input { Ok(source) => { let content_result = match source { ResolvedInputSource::FsPath(path) => { - Self::path_content(&path).await + Self::path_content(&path, &pre).await }, ResolvedInputSource::RemoteUrl(url) => { resolver.url_contents(*url).await @@ -353,20 +355,16 @@ impl Input { /// Returns an error if the file cannot be read pub async fn path_content + AsRef + Clone>( path: P, + pre: &Option, ) -> Result { let path = path.into(); + let content = Self::get_content(&path, pre).await?; - let content = tokio::fs::read_to_string(&path) - .await - .map_err(|e| ErrorKind::ReadFileInput(e, path.clone()))?; - - let input_content = InputContent { + Ok(InputContent { file_type: FileType::from(&path), source: ResolvedInputSource::FsPath(path), content, - }; - - Ok(input_content) + }) } /// Create `InputContent` from stdin. @@ -393,6 +391,34 @@ impl Input { pub fn string_content(s: &str, file_type_hint: Option) -> InputContent { InputContent::from_string(s, file_type_hint.unwrap_or_default()) } + + async fn get_content(path: &PathBuf, pre: &Option) -> Result { + if let Some(pre) = pre { + let output = Command::new(pre).arg(path).output().map_err(|e| { + ErrorKind::PreprocessorError(pre.clone(), format!("could not start: {e}")) + })?; + + if output.status.success() { + String::from_utf8(output.stdout).map_err(|e| ErrorKind::Utf8(e.utf8_error())) + } else { + let mut stderr = String::from_utf8(output.stderr) + .map_err(|e| ErrorKind::Utf8(e.utf8_error()))?; + + if stderr.is_empty() { + stderr = "".to_owned(); + } + + Err(ErrorKind::PreprocessorError( + pre.clone(), + format!("exited with non-zero code: {stderr}"), + )) + } + } else { + Ok(tokio::fs::read_to_string(path) + .await + .map_err(|e| ErrorKind::ReadFileInput(e, path.clone()))?) + } + } } impl TryFrom<&str> for Input { From 8f142e8cb63a35d4f29d589340bfe72d76b157df Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Mon, 27 Oct 2025 13:56:20 +0100 Subject: [PATCH 03/10] Test preprocessor option --- fixtures/pre/error_message.sh | 5 +++ fixtures/pre/no_error_message.sh | 3 ++ lychee-bin/tests/cli.rs | 67 +++++++++++++++++++++++++++++++- lychee-lib/src/collector.rs | 2 + 4 files changed, 76 insertions(+), 1 deletion(-) create mode 100755 fixtures/pre/error_message.sh create mode 100755 fixtures/pre/no_error_message.sh diff --git a/fixtures/pre/error_message.sh b/fixtures/pre/error_message.sh new file mode 100755 index 0000000000..0df66517af --- /dev/null +++ b/fixtures/pre/error_message.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +>&2 echo "Some error message" + +exit 1 diff --git a/fixtures/pre/no_error_message.sh b/fixtures/pre/no_error_message.sh new file mode 100755 index 0000000000..f019ff95ba --- /dev/null +++ b/fixtures/pre/no_error_message.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash + +exit 1 diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 6fc91300ac..ff705c02c5 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -61,7 +61,7 @@ mod cli { /// Assert actual output lines equals to expected lines. /// Order of the lines is ignored. fn assert_lines_eq + Ord>(result: Assert, mut expected_lines: Vec) { - let output = result.get_output().stdout.clone(); + let output = &result.get_output().stdout; let mut actual_lines: Vec = output .lines() .map(|line| line.unwrap().to_string()) @@ -3025,4 +3025,69 @@ mod cli { Ok(()) } + + /// Preprocessing with `cat` is like an identity function because it + /// outputs its input without any changes. + #[test] + fn test_pre_cat() { + let file = fixtures_path!().join("TEST.md"); + let pre_with_cat = main_command!() + .arg("--pre") + .arg("cat") + .arg("--dump") + .arg(&file) + .assert() + .success(); + + let no_pre = main_command!() + .arg("--dump") + .arg(&file) + .assert() + .success() + .get_output() + .stdout + .lines() + .map(|line| line.unwrap().to_string()) + .collect(); + + assert_lines_eq(pre_with_cat, no_pre); + } + + #[test] + fn test_pre_invalid_command() { + let file = fixtures_path!().join("TEST.md"); + main_command!() + .arg("--pre") + .arg("program does not exist") + .arg(file) + .assert() + .failure() + .stderr(contains("Error: Preprocessor command 'program does not exist' failed: could not start: No such file or directory (os error 2)")); + } + + #[test] + fn test_pre_error() { + let file = fixtures_path!().join("TEST.md"); + let script = fixtures_path!().join("pre").join("no_error_message.sh"); + main_command!() + .arg("--pre") + .arg(&script) + .arg(&file) + .assert() + .failure() + .stderr(contains(format!( + "Error: Preprocessor command '{}' failed: exited with non-zero code: ", script.as_os_str().to_str().unwrap() + ))); + + let script = fixtures_path!().join("pre").join("error_message.sh"); + main_command!() + .arg("--pre") + .arg(&script) + .arg(file) + .assert() + .failure() + .stderr(contains(format!( + "Error: Preprocessor command '{}' failed: exited with non-zero code: Some error message", script.as_os_str().to_str().unwrap() + ))); + } } diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index c7bf1f1341..7a50440ba5 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -378,6 +378,7 @@ mod tests { FileType::default_extensions(), UrlContentResolver::default(), PathExcludes::empty(), + None, ) .collect::>() .await; @@ -398,6 +399,7 @@ mod tests { FileType::default_extensions(), UrlContentResolver::default(), PathExcludes::empty(), + None, ) .collect::>() .await; From 33ce218ac728d53f735a031943c62bc47e790671 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Tue, 28 Oct 2025 17:04:32 +0100 Subject: [PATCH 04/10] Extract Preprocessor type & add documentation --- benches/src/extract.rs | 4 +- lychee-bin/src/main.rs | 2 +- lychee-bin/src/options.rs | 29 +++++++++++-- lychee-lib/src/collector.rs | 17 ++++---- lychee-lib/src/lib.rs | 4 +- lychee-lib/src/types/input/input.rs | 34 ++++------------ lychee-lib/src/types/mod.rs | 2 + lychee-lib/src/types/preprocessor/mod.rs | 52 ++++++++++++++++++++++++ 8 files changed, 102 insertions(+), 42 deletions(-) create mode 100644 lychee-lib/src/types/preprocessor/mod.rs diff --git a/benches/src/extract.rs b/benches/src/extract.rs index 82a15c021a..d68ad1b65d 100644 --- a/benches/src/extract.rs +++ b/benches/src/extract.rs @@ -42,10 +42,10 @@ fn benchmark(c: &mut Criterion) { runtime.block_on(async { inputs = vec![ - Input::path_content("../fixtures/bench/elvis.html") + Input::path_content("../fixtures/bench/elvis.html", &None) .await .unwrap(), - Input::path_content("../fixtures/bench/arch.html") + Input::path_content("../fixtures/bench/arch.html", &None) .await .unwrap(), ]; diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index 3e8287d4b3..4473403e90 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -351,7 +351,7 @@ async fn run(opts: &LycheeOptions) -> Result { // File a bug if you rely on this envvar! It's going to go away eventually. .use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").is_ok_and(|x| x == "1")) .include_wikilinks(opts.config.include_wikilinks) - .pre(opts.config.pre.clone()); + .preprocessor(opts.config.pre.clone()); collector = if let Some(ref basic_auth) = opts.config.basic_auth { collector.basic_auth_extractor(BasicAuthExtractor::new(basic_auth)?) diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index ab94fadf98..7fd2e107f4 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -10,6 +10,7 @@ use http::{ HeaderMap, header::{HeaderName, HeaderValue}, }; +use lychee_lib::Preprocessor; use lychee_lib::{ Base, BasicAuthSelector, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT, FileExtensions, @@ -858,11 +859,33 @@ and existing cookies will be updated." #[arg( short, long, - long_help = "Preprocess input files. -This allows files not recognized by lychee to be converted into a compatible format." + value_name = "COMMAND", + long_help = r#"Preprocess input files. +For each input file, this flag causes lychee to process the standard output of COMMAND PATH instead of the contents of PATH. +This allows you to convert files that would otherwise not be understood by lychee. +The preprocessor COMMAND is only run on input files, not on standard input or URLs. + +To invoke programs with custom arguments or to use multiple preprocessors use a wrapper program such as a shell script. +An example script looks like this: + +#!/usr/bin/env bash +case "$1" in +*.epub|*.odt|*.docx|*.ipynb) + exec pandoc "$1" --to=html --wrap=none --markdown-headings=atx + ;; +*.adoc|*.asciidoc) + asciidoctor -a stylesheet! "$1" -o - + ;; +*.pdf) + exec pdftotext "$1" - + ;; +*) + exec cat # identity function, output input without changes + ;; +esac"# )] #[serde(default)] - pub(crate) pre: Option, + pub(crate) pre: Option, } impl Config { diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 7a50440ba5..f5cbc95a22 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -1,5 +1,6 @@ use crate::ErrorKind; use crate::InputSource; +use crate::Preprocessor; use crate::filter::PathExcludes; use crate::types::resolver::UrlContentResolver; use crate::{ @@ -36,7 +37,7 @@ pub struct Collector { excluded_paths: PathExcludes, headers: HeaderMap, client: Client, - pre: Option, + preprocessor: Option, } impl Default for Collector { @@ -60,7 +61,7 @@ impl Default for Collector { headers: HeaderMap::new(), client: Client::new(), excluded_paths: PathExcludes::empty(), - pre: None, + preprocessor: None, } } } @@ -86,7 +87,7 @@ impl Collector { use_html5ever: false, skip_hidden: true, skip_ignored: true, - pre: None, + preprocessor: None, headers: HeaderMap::new(), client: Client::builder() .build() @@ -154,10 +155,10 @@ impl Collector { self } - /// Skip over links in verbatim sections (like Markdown code blocks) + /// Configure a file [`Preprocessor`] #[must_use] - pub fn pre(mut self, pre: Option) -> Self { - self.pre = pre; + pub fn preprocessor(mut self, preprocessor: Option) -> Self { + self.preprocessor = preprocessor; self } @@ -273,7 +274,7 @@ impl Collector { let extensions = extensions.clone(); let resolver = resolver.clone(); let excluded_paths = excluded_paths.clone(); - let pre = self.pre.clone(); + let preprocessor = self.preprocessor.clone(); async move { let base = match &input.source { @@ -289,7 +290,7 @@ impl Collector { extensions, resolver, excluded_paths, - pre, + preprocessor, ) .map(move |content| (content, base.clone())) } diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs index 37dfd62ea9..c91cc65098 100644 --- a/lychee-lib/src/lib.rs +++ b/lychee-lib/src/lib.rs @@ -95,7 +95,7 @@ pub use crate::{ types::{ AcceptRange, AcceptRangeError, Base, BasicAuthCredentials, BasicAuthSelector, CacheStatus, CookieJar, ErrorKind, FileExtensions, FileType, Input, InputContent, InputResolver, - InputSource, Redirects, Request, ResolvedInputSource, Response, ResponseBody, Result, - Status, StatusCodeExcluder, StatusCodeSelector, uri::valid::Uri, + InputSource, Preprocessor, Redirects, Request, ResolvedInputSource, Response, ResponseBody, + Result, Status, StatusCodeExcluder, StatusCodeSelector, uri::valid::Uri, }, }; diff --git a/lychee-lib/src/types/input/input.rs b/lychee-lib/src/types/input/input.rs index bf36ac54ca..095ca4e526 100644 --- a/lychee-lib/src/types/input/input.rs +++ b/lychee-lib/src/types/input/input.rs @@ -7,6 +7,7 @@ use super::InputResolver; use super::content::InputContent; use super::source::InputSource; use super::source::ResolvedInputSource; +use crate::Preprocessor; use crate::filter::PathExcludes; use crate::types::FileType; use crate::types::file::FileExtensions; @@ -19,7 +20,6 @@ use ignore::WalkBuilder; use reqwest::Url; use shellexpand::tilde; use std::path::{Path, PathBuf}; -use std::process::Command; use tokio::io::{AsyncReadExt, stdin}; const STDIN: &str = "-"; @@ -171,7 +171,7 @@ impl Input { file_extensions: FileExtensions, resolver: UrlContentResolver, excluded_paths: PathExcludes, - pre: Option, + preprocessor: Option, ) -> impl Stream> { try_stream! { // Handle simple cases that don't need resolution @@ -211,7 +211,7 @@ impl Input { Ok(source) => { let content_result = match source { ResolvedInputSource::FsPath(path) => { - Self::path_content(&path, &pre).await + Self::path_content(&path, &preprocessor).await }, ResolvedInputSource::RemoteUrl(url) => { resolver.url_contents(*url).await @@ -355,10 +355,10 @@ impl Input { /// Returns an error if the file cannot be read pub async fn path_content + AsRef + Clone>( path: P, - pre: &Option, + preprocessor: &Option, ) -> Result { let path = path.into(); - let content = Self::get_content(&path, pre).await?; + let content = Self::get_content(&path, preprocessor).await?; Ok(InputContent { file_type: FileType::from(&path), @@ -392,27 +392,9 @@ impl Input { InputContent::from_string(s, file_type_hint.unwrap_or_default()) } - async fn get_content(path: &PathBuf, pre: &Option) -> Result { - if let Some(pre) = pre { - let output = Command::new(pre).arg(path).output().map_err(|e| { - ErrorKind::PreprocessorError(pre.clone(), format!("could not start: {e}")) - })?; - - if output.status.success() { - String::from_utf8(output.stdout).map_err(|e| ErrorKind::Utf8(e.utf8_error())) - } else { - let mut stderr = String::from_utf8(output.stderr) - .map_err(|e| ErrorKind::Utf8(e.utf8_error()))?; - - if stderr.is_empty() { - stderr = "".to_owned(); - } - - Err(ErrorKind::PreprocessorError( - pre.clone(), - format!("exited with non-zero code: {stderr}"), - )) - } + async fn get_content(path: &PathBuf, preprocessor: &Option) -> Result { + if let Some(pre) = preprocessor { + pre.process(path) } else { Ok(tokio::fs::read_to_string(path) .await diff --git a/lychee-lib/src/types/mod.rs b/lychee-lib/src/types/mod.rs index 679b26f972..da10e0aad6 100644 --- a/lychee-lib/src/types/mod.rs +++ b/lychee-lib/src/types/mod.rs @@ -9,6 +9,7 @@ mod error; mod file; mod input; pub(crate) mod mail; +mod preprocessor; pub(crate) mod redirect_history; mod request; pub(crate) mod resolver; @@ -25,6 +26,7 @@ pub use cookies::CookieJar; pub use error::ErrorKind; pub use file::{FileExtensions, FileType}; pub use input::{Input, InputContent, InputResolver, InputSource, ResolvedInputSource}; +pub use preprocessor::Preprocessor; pub use redirect_history::Redirects; pub use request::Request; pub use response::{Response, ResponseBody}; diff --git a/lychee-lib/src/types/preprocessor/mod.rs b/lychee-lib/src/types/preprocessor/mod.rs new file mode 100644 index 0000000000..8e0dc1b591 --- /dev/null +++ b/lychee-lib/src/types/preprocessor/mod.rs @@ -0,0 +1,52 @@ +use std::{path::PathBuf, process::Command}; + +use serde::Deserialize; + +use super::{ErrorKind, Result}; + +/// Preprocess files with the specified command. +/// So instead of reading the file contents directly, +/// lychee will read the output of the preprocessor command. +/// The specified command is invoked with one argument, the path to the input file. +/// +/// For example using `cat` is equivalent to not specifying any preprocessor command. +/// To invoke programs with custom arguments, +/// create a shell script to specify it as preprocessor command. +#[derive(Debug, Clone, Deserialize, PartialEq, Eq)] +pub struct Preprocessor(String); + +impl From for Preprocessor { + fn from(s: String) -> Self { + Self(s) + } +} + +impl Preprocessor { + /// Try to invoke the preprocessor command with `path` as single argument + /// and return the resulting stdout. + pub(crate) fn process(&self, path: &PathBuf) -> Result { + let pre = &self.0; + let output = Command::new(pre).arg(path).output().map_err(|e| { + ErrorKind::PreprocessorError(pre.clone(), format!("could not start: {e}")) + })?; + + if output.status.success() { + from_utf8(output.stdout) + } else { + let mut stderr = from_utf8(output.stderr)?; + + if stderr.is_empty() { + stderr = "".to_owned(); + } + + Err(ErrorKind::PreprocessorError( + pre.clone(), + format!("exited with non-zero code: {stderr}"), + )) + } + } +} + +fn from_utf8(data: Vec) -> Result { + String::from_utf8(data).map_err(|e| ErrorKind::Utf8(e.utf8_error())) +} From e21dd48326ff9ec0e425b3b32160479e8a09c33e Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Sat, 1 Nov 2025 11:40:07 +0100 Subject: [PATCH 05/10] Apply clippy suggestions --- README.md | 25 ++++++++++++++ benches/src/extract.rs | 4 +-- lychee-bin/tests/cli.rs | 6 +--- lychee-lib/src/collector.rs | 1 + lychee-lib/src/types/input/input.rs | 41 ++++++++--------------- lychee-lib/src/types/input/resolver.rs | 42 +++++++++++++++--------- lychee-lib/src/types/preprocessor/mod.rs | 2 +- 7 files changed, 70 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index f1489576f0..be5a6c7bae 100644 --- a/README.md +++ b/README.md @@ -609,6 +609,31 @@ Options: --offline Only check local files and block network requests + -p, --pre + Preprocess input files. + For each input file, this flag causes lychee to process the standard output of COMMAND PATH instead of the contents of PATH. + This allows you to convert files that would otherwise not be understood by lychee. + The preprocessor COMMAND is only run on input files, not on standard input or URLs. + + To invoke programs with custom arguments or to use multiple preprocessors use a wrapper program such as a shell script. + An example script looks like this: + + #!/usr/bin/env bash + case "$1" in + *.epub|*.odt|*.docx|*.ipynb) + exec pandoc "$1" --to=html --wrap=none --markdown-headings=atx + ;; + *.adoc|*.asciidoc) + asciidoctor -a stylesheet! "$1" -o - + ;; + *.pdf) + exec pdftotext "$1" - + ;; + *) + exec cat # identity function, output input without changes + ;; + esac + -q, --quiet... Less output per occurrence (e.g. `-q` or `-qq`) diff --git a/benches/src/extract.rs b/benches/src/extract.rs index d68ad1b65d..c29f24250e 100644 --- a/benches/src/extract.rs +++ b/benches/src/extract.rs @@ -42,10 +42,10 @@ fn benchmark(c: &mut Criterion) { runtime.block_on(async { inputs = vec![ - Input::path_content("../fixtures/bench/elvis.html", &None) + Input::path_content("../fixtures/bench/elvis.html", None) .await .unwrap(), - Input::path_content("../fixtures/bench/arch.html", &None) + Input::path_content("../fixtures/bench/arch.html", None) .await .unwrap(), ]; diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index ff705c02c5..cad79b7761 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -1140,11 +1140,7 @@ mod cli { // Clean up fs::remove_file(&cache_file).map_err(|e| { - anyhow::anyhow!( - "Failed to remove cache file: {:?}, error: {}", - cache_file, - e - ) + anyhow::anyhow!("Failed to remove cache file: {cache_file:?}, error: {e}") })?; Ok(()) diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index f5cbc95a22..4210dec1c9 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -2,6 +2,7 @@ use crate::ErrorKind; use crate::InputSource; use crate::Preprocessor; use crate::filter::PathExcludes; + use crate::types::resolver::UrlContentResolver; use crate::{ Base, Input, InputResolver, Request, Result, basic_auth::BasicAuthExtractor, diff --git a/lychee-lib/src/types/input/input.rs b/lychee-lib/src/types/input/input.rs index 095ca4e526..8e5c9fcfb9 100644 --- a/lychee-lib/src/types/input/input.rs +++ b/lychee-lib/src/types/input/input.rs @@ -5,18 +5,14 @@ use super::InputResolver; use super::content::InputContent; -use super::source::InputSource; -use super::source::ResolvedInputSource; +use super::source::{InputSource, ResolvedInputSource}; use crate::Preprocessor; use crate::filter::PathExcludes; -use crate::types::FileType; -use crate::types::file::FileExtensions; -use crate::types::resolver::UrlContentResolver; +use crate::types::{FileType, file::FileExtensions, resolver::UrlContentResolver}; use crate::{ErrorKind, Result}; use async_stream::try_stream; use futures::stream::{Stream, StreamExt}; use glob::glob_with; -use ignore::WalkBuilder; use reqwest::Url; use shellexpand::tilde; use std::path::{Path, PathBuf}; @@ -163,6 +159,10 @@ impl Input { /// Returns an error if the contents can not be retrieved because of an /// underlying I/O error (e.g. an error while making a network request or /// retrieving the contents from the file system) + #[allow( + clippy::too_many_arguments, + reason = "https://github.com/lycheeverse/lychee/issues/1898" + )] pub fn get_contents( self, skip_missing: bool, @@ -211,7 +211,7 @@ impl Input { Ok(source) => { let content_result = match source { ResolvedInputSource::FsPath(path) => { - Self::path_content(&path, &preprocessor).await + Self::path_content(&path, preprocessor.as_ref()).await }, ResolvedInputSource::RemoteUrl(url) => { resolver.url_contents(*url).await @@ -249,24 +249,6 @@ impl Input { } } - /// Create a `WalkBuilder` for directory traversal - fn walk_entries( - path: &Path, - file_extensions: FileExtensions, - skip_hidden: bool, - skip_gitignored: bool, - ) -> Result { - Ok(WalkBuilder::new(path) - // Enable standard filters if `skip_gitignored `is true. - // This will skip files ignored by `.gitignore` and other VCS ignore files. - .standard_filters(skip_gitignored) - // Override hidden file behavior to be controlled by the separate skip_hidden parameter - .hidden(skip_hidden) - // Configure the file types filter to only include files with matching extensions - .types(file_extensions.try_into()?) - .build()) - } - /// Retrieve all sources from this input. The output depends on the type of /// input: /// @@ -319,7 +301,7 @@ impl Input { } InputSource::FsPath(ref path) => { if path.is_dir() { - for entry in Input::walk_entries( + for entry in InputResolver::walk( path, file_extensions, skip_hidden, @@ -353,9 +335,10 @@ impl Input { /// # Errors /// /// Returns an error if the file cannot be read + /// or [`Preprocessor`] failed pub async fn path_content + AsRef + Clone>( path: P, - preprocessor: &Option, + preprocessor: Option<&Preprocessor>, ) -> Result { let path = path.into(); let content = Self::get_content(&path, preprocessor).await?; @@ -392,7 +375,9 @@ impl Input { InputContent::from_string(s, file_type_hint.unwrap_or_default()) } - async fn get_content(path: &PathBuf, preprocessor: &Option) -> Result { + /// Get content of file. + /// Get preprocessed file content if [`Preprocessor`] is [`Some`] + async fn get_content(path: &PathBuf, preprocessor: Option<&Preprocessor>) -> Result { if let Some(pre) = preprocessor { pre.process(path) } else { diff --git a/lychee-lib/src/types/input/resolver.rs b/lychee-lib/src/types/input/resolver.rs index 2abc52b94b..6bb5f79ffa 100644 --- a/lychee-lib/src/types/input/resolver.rs +++ b/lychee-lib/src/types/input/resolver.rs @@ -3,6 +3,8 @@ //! Provides the `InputResolver` which handles resolution of various input sources //! into concrete, processable sources by expanding glob patterns and applying filters. +use std::path::Path; + use super::input::Input; use super::source::{InputSource, ResolvedInputSource}; use crate::Result; @@ -11,7 +13,7 @@ use crate::types::file::FileExtensions; use async_stream::try_stream; use futures::stream::Stream; use glob::glob_with; -use ignore::WalkBuilder; +use ignore::{Walk, WalkBuilder}; use shellexpand::tilde; /// Resolves input sources into concrete, processable sources. @@ -52,6 +54,29 @@ impl InputResolver { ) } + /// Create a [`Walk`] iterator for directory traversal + /// + /// # Errors + /// + /// Fails if [`FileExtensions`] cannot be converted + pub(crate) fn walk( + path: &Path, + file_extensions: FileExtensions, + skip_hidden: bool, + skip_gitignored: bool, + ) -> Result { + Ok(WalkBuilder::new(path) + .git_ignore(skip_gitignored) + .git_global(skip_gitignored) + .git_exclude(skip_gitignored) + .ignore(skip_gitignored) + .parents(skip_gitignored) + .hidden(skip_hidden) + // Configure the file types filter to only include files with matching extensions + .types(file_extensions.try_into()?) + .build()) + } + /// Internal method for resolving input sources. /// /// Takes an Input and returns a stream of `ResolvedInputSource` items, @@ -103,20 +128,7 @@ impl InputResolver { }, InputSource::FsPath(path) => { if path.is_dir() { - let walk = WalkBuilder::new(path) - // Enable standard filters if `skip_gitignored `is - // true. This will skip files ignored by - // `.gitignore` and other VCS ignore files. - .standard_filters(skip_gitignored) - // Override hidden file behavior to be controlled by - // the separate skip_hidden parameter - .hidden(skip_hidden) - // Configure the file types filter to only include - // files with matching extensions - .types(file_extensions.try_into()?) - .build(); - - for entry in walk { + for entry in Self::walk(path, file_extensions, skip_hidden, skip_gitignored)? { let entry = entry?; if excluded_paths.is_match(&entry.path().to_string_lossy()) { continue; diff --git a/lychee-lib/src/types/preprocessor/mod.rs b/lychee-lib/src/types/preprocessor/mod.rs index 8e0dc1b591..ccac04a58e 100644 --- a/lychee-lib/src/types/preprocessor/mod.rs +++ b/lychee-lib/src/types/preprocessor/mod.rs @@ -36,7 +36,7 @@ impl Preprocessor { let mut stderr = from_utf8(output.stderr)?; if stderr.is_empty() { - stderr = "".to_owned(); + stderr = "".into(); } Err(ErrorKind::PreprocessorError( From 39cd56b7576171003ce9fe257946b88393f82644 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Fri, 31 Oct 2025 17:57:32 +0100 Subject: [PATCH 06/10] Update help message --- README.md | 11 ++++++----- lychee-bin/src/options.rs | 11 ++++++----- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index be5a6c7bae..5c3fe93481 100644 --- a/README.md +++ b/README.md @@ -611,12 +611,13 @@ Options: -p, --pre Preprocess input files. - For each input file, this flag causes lychee to process the standard output of COMMAND PATH instead of the contents of PATH. - This allows you to convert files that would otherwise not be understood by lychee. - The preprocessor COMMAND is only run on input files, not on standard input or URLs. + For each file input, this flag causes lychee to execute `COMMAND PATH` and process + its standard output instead of the original contents of PATH. This allows you to + convert files that would otherwise not be understood by lychee. The preprocessor + COMMAND is only run on input files, not on standard input or URLs. - To invoke programs with custom arguments or to use multiple preprocessors use a wrapper program such as a shell script. - An example script looks like this: + To invoke programs with custom arguments or to use multiple preprocessors use a + wrapper program such as a shell script. An example script looks like this: #!/usr/bin/env bash case "$1" in diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 7fd2e107f4..ee5c0bf026 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -861,12 +861,13 @@ and existing cookies will be updated." long, value_name = "COMMAND", long_help = r#"Preprocess input files. -For each input file, this flag causes lychee to process the standard output of COMMAND PATH instead of the contents of PATH. -This allows you to convert files that would otherwise not be understood by lychee. -The preprocessor COMMAND is only run on input files, not on standard input or URLs. +For each file input, this flag causes lychee to execute `COMMAND PATH` and process +its standard output instead of the original contents of PATH. This allows you to +convert files that would otherwise not be understood by lychee. The preprocessor +COMMAND is only run on input files, not on standard input or URLs. -To invoke programs with custom arguments or to use multiple preprocessors use a wrapper program such as a shell script. -An example script looks like this: +To invoke programs with custom arguments or to use multiple preprocessors use a +wrapper program such as a shell script. An example script looks like this: #!/usr/bin/env bash case "$1" in From 71ab3bc9a750bdf1db84a9d0464f2eef3ecccb4f Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Mon, 3 Nov 2025 08:39:52 +0100 Subject: [PATCH 07/10] Update lychee-bin/src/options.rs Co-authored-by: Matthias Endler --- lychee-bin/src/options.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index ee5c0bf026..5d578f5c31 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -866,7 +866,7 @@ its standard output instead of the original contents of PATH. This allows you to convert files that would otherwise not be understood by lychee. The preprocessor COMMAND is only run on input files, not on standard input or URLs. -To invoke programs with custom arguments or to use multiple preprocessors use a +To invoke programs with custom arguments or to use multiple preprocessors, use a wrapper program such as a shell script. An example script looks like this: #!/usr/bin/env bash From 5406d15ec08bb0ad963bec6280c5bbc38e43f370 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Mon, 3 Nov 2025 09:16:32 +0100 Subject: [PATCH 08/10] Apply review suggestions --- README.md | 4 ++-- lychee-bin/src/commands/dump_inputs.rs | 4 ++-- lychee-bin/src/main.rs | 2 +- lychee-bin/src/options.rs | 4 ++-- lychee-bin/tests/cli.rs | 8 +++---- lychee-lib/src/types/error.rs | 13 ++++++++---- lychee-lib/src/types/input/input.rs | 8 +++---- lychee-lib/src/types/input/resolver.rs | 22 ++++++++++--------- lychee-lib/src/types/preprocessor/mod.rs | 27 ++++++++++++++---------- 9 files changed, 52 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index 5c3fe93481..173bec4333 100644 --- a/README.md +++ b/README.md @@ -609,14 +609,14 @@ Options: --offline Only check local files and block network requests - -p, --pre + -p, --preprocess Preprocess input files. For each file input, this flag causes lychee to execute `COMMAND PATH` and process its standard output instead of the original contents of PATH. This allows you to convert files that would otherwise not be understood by lychee. The preprocessor COMMAND is only run on input files, not on standard input or URLs. - To invoke programs with custom arguments or to use multiple preprocessors use a + To invoke programs with custom arguments or to use multiple preprocessors, use a wrapper program such as a shell script. An example script looks like this: #!/usr/bin/env bash diff --git a/lychee-bin/src/commands/dump_inputs.rs b/lychee-bin/src/commands/dump_inputs.rs index ca1db81bd2..a32e050010 100644 --- a/lychee-bin/src/commands/dump_inputs.rs +++ b/lychee-bin/src/commands/dump_inputs.rs @@ -18,7 +18,7 @@ pub(crate) async fn dump_inputs( excluded_paths: &[String], file_extensions: &FileExtensions, skip_hidden: bool, - skip_gitignored: bool, + skip_ignored: bool, ) -> Result { if let Some(out_file) = output { fs::File::create(out_file)?; @@ -36,7 +36,7 @@ pub(crate) async fn dump_inputs( let sources_stream = input.get_sources( file_extensions.clone(), skip_hidden, - skip_gitignored, + skip_ignored, &excluded_path_filter, ); tokio::pin!(sources_stream); diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index 4473403e90..00493408ba 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -351,7 +351,7 @@ async fn run(opts: &LycheeOptions) -> Result { // File a bug if you rely on this envvar! It's going to go away eventually. .use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").is_ok_and(|x| x == "1")) .include_wikilinks(opts.config.include_wikilinks) - .preprocessor(opts.config.pre.clone()); + .preprocessor(opts.config.preprocess.clone()); collector = if let Some(ref basic_auth) = opts.config.basic_auth { collector.basic_auth_extractor(BasicAuthExtractor::new(basic_auth)?) diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 5d578f5c31..0561edc9b0 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -886,7 +886,7 @@ case "$1" in esac"# )] #[serde(default)] - pub(crate) pre: Option, + pub(crate) preprocess: Option, } impl Config { @@ -977,7 +977,7 @@ impl Config { no_progress: false, offline: false, output: None, - pre: None, + preprocess: None, remap: Vec::::new(), require_https: false, retry_wait_time: DEFAULT_RETRY_WAIT_TIME_SECS, diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index cad79b7761..5ea253be14 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -3028,7 +3028,7 @@ mod cli { fn test_pre_cat() { let file = fixtures_path!().join("TEST.md"); let pre_with_cat = main_command!() - .arg("--pre") + .arg("--preprocess") .arg("cat") .arg("--dump") .arg(&file) @@ -3053,7 +3053,7 @@ mod cli { fn test_pre_invalid_command() { let file = fixtures_path!().join("TEST.md"); main_command!() - .arg("--pre") + .arg("--preprocess") .arg("program does not exist") .arg(file) .assert() @@ -3066,7 +3066,7 @@ mod cli { let file = fixtures_path!().join("TEST.md"); let script = fixtures_path!().join("pre").join("no_error_message.sh"); main_command!() - .arg("--pre") + .arg("--preprocess") .arg(&script) .arg(&file) .assert() @@ -3077,7 +3077,7 @@ mod cli { let script = fixtures_path!().join("pre").join("error_message.sh"); main_command!() - .arg("--pre") + .arg("--preprocess") .arg(&script) .arg(file) .assert() diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 91df63515d..1efab972f3 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -171,8 +171,13 @@ pub enum ErrorKind { StatusCodeSelectorError(#[from] StatusCodeSelectorError), /// Preprocessor command error - #[error("Preprocessor command '{0}' failed: {1}")] - PreprocessorError(String, String), + #[error("Preprocessor command '{command}' failed: {reason}")] + PreprocessorError { + /// The command which did not execute successfully + command: String, + /// The reason the command failed + reason: String, + }, /// Test-only error variant for formatter tests /// Available in both test and debug builds to support cross-crate testing @@ -339,7 +344,7 @@ impl ErrorKind { [name] => format!("An index file ({name}) is required"), [init @ .., tail] => format!("An index file ({}, or {}) is required", init.join(", "), tail), }.into(), - ErrorKind::PreprocessorError(command, reason) => Some(format!("Command '{command}' failed {reason}. Check value of the pre option")) + ErrorKind::PreprocessorError{command, reason} => Some(format!("Command '{command}' failed {reason}. Check value of the pre option")) } } @@ -475,7 +480,7 @@ impl Hash for ErrorKind { Self::BasicAuthExtractorError(e) => e.to_string().hash(state), Self::Cookies(e) => e.hash(state), Self::StatusCodeSelectorError(e) => e.to_string().hash(state), - Self::PreprocessorError(c, e) => (c, e).hash(state), + Self::PreprocessorError { command, reason } => (command, reason).hash(state), } } } diff --git a/lychee-lib/src/types/input/input.rs b/lychee-lib/src/types/input/input.rs index 8e5c9fcfb9..1bcc372c2e 100644 --- a/lychee-lib/src/types/input/input.rs +++ b/lychee-lib/src/types/input/input.rs @@ -167,7 +167,7 @@ impl Input { self, skip_missing: bool, skip_hidden: bool, - skip_gitignored: bool, + skip_ignored: bool, file_extensions: FileExtensions, resolver: UrlContentResolver, excluded_paths: PathExcludes, @@ -200,7 +200,7 @@ impl Input { &self, file_extensions, skip_hidden, - skip_gitignored, + skip_ignored, &excluded_paths, )); @@ -275,7 +275,7 @@ impl Input { self, file_extensions: FileExtensions, skip_hidden: bool, - skip_gitignored: bool, + skip_ignored: bool, excluded_paths: &PathExcludes, ) -> impl Stream> { try_stream! { @@ -305,7 +305,7 @@ impl Input { path, file_extensions, skip_hidden, - skip_gitignored, + skip_ignored, )? { let entry = entry?; if !Self::is_excluded_path(entry.path(), excluded_paths) { diff --git a/lychee-lib/src/types/input/resolver.rs b/lychee-lib/src/types/input/resolver.rs index 6bb5f79ffa..89e95fb276 100644 --- a/lychee-lib/src/types/input/resolver.rs +++ b/lychee-lib/src/types/input/resolver.rs @@ -42,14 +42,14 @@ impl InputResolver { input: &'a Input, file_extensions: FileExtensions, skip_hidden: bool, - skip_gitignored: bool, + skip_ignored: bool, excluded_paths: &'a PathExcludes, ) -> impl Stream> + 'a { Self::resolve_input( input, file_extensions, skip_hidden, - skip_gitignored, + skip_ignored, excluded_paths, ) } @@ -63,14 +63,16 @@ impl InputResolver { path: &Path, file_extensions: FileExtensions, skip_hidden: bool, - skip_gitignored: bool, + skip_ignored: bool, ) -> Result { Ok(WalkBuilder::new(path) - .git_ignore(skip_gitignored) - .git_global(skip_gitignored) - .git_exclude(skip_gitignored) - .ignore(skip_gitignored) - .parents(skip_gitignored) + // Skip over files which are ignored by git or `.ignore` if necessary + .git_ignore(skip_ignored) + .git_global(skip_ignored) + .git_exclude(skip_ignored) + .ignore(skip_ignored) + .parents(skip_ignored) + // Ignore hidden files if necessary .hidden(skip_hidden) // Configure the file types filter to only include files with matching extensions .types(file_extensions.try_into()?) @@ -86,7 +88,7 @@ impl InputResolver { input: &'a Input, file_extensions: FileExtensions, skip_hidden: bool, - skip_gitignored: bool, + skip_ignored: bool, excluded_paths: &'a PathExcludes, ) -> impl Stream> + 'a { try_stream! { @@ -128,7 +130,7 @@ impl InputResolver { }, InputSource::FsPath(path) => { if path.is_dir() { - for entry in Self::walk(path, file_extensions, skip_hidden, skip_gitignored)? { + for entry in Self::walk(path, file_extensions, skip_hidden, skip_ignored)? { let entry = entry?; if excluded_paths.is_match(&entry.path().to_string_lossy()) { continue; diff --git a/lychee-lib/src/types/preprocessor/mod.rs b/lychee-lib/src/types/preprocessor/mod.rs index ccac04a58e..0999b15332 100644 --- a/lychee-lib/src/types/preprocessor/mod.rs +++ b/lychee-lib/src/types/preprocessor/mod.rs @@ -13,11 +13,13 @@ use super::{ErrorKind, Result}; /// To invoke programs with custom arguments, /// create a shell script to specify it as preprocessor command. #[derive(Debug, Clone, Deserialize, PartialEq, Eq)] -pub struct Preprocessor(String); +pub struct Preprocessor { + command: String, +} impl From for Preprocessor { - fn from(s: String) -> Self { - Self(s) + fn from(command: String) -> Self { + Self { command } } } @@ -25,10 +27,13 @@ impl Preprocessor { /// Try to invoke the preprocessor command with `path` as single argument /// and return the resulting stdout. pub(crate) fn process(&self, path: &PathBuf) -> Result { - let pre = &self.0; - let output = Command::new(pre).arg(path).output().map_err(|e| { - ErrorKind::PreprocessorError(pre.clone(), format!("could not start: {e}")) - })?; + let output = Command::new(&self.command) + .arg(path) + .output() + .map_err(|e| ErrorKind::PreprocessorError { + command: self.command.clone(), + reason: format!("could not start: {e}"), + })?; if output.status.success() { from_utf8(output.stdout) @@ -39,10 +44,10 @@ impl Preprocessor { stderr = "".into(); } - Err(ErrorKind::PreprocessorError( - pre.clone(), - format!("exited with non-zero code: {stderr}"), - )) + Err(ErrorKind::PreprocessorError { + command: self.command.clone(), + reason: format!("exited with non-zero code: {stderr}"), + }) } } } From dc800ea3cb6f6250bda62721a6d3a44ff13105bd Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Mon, 3 Nov 2025 10:23:55 +0100 Subject: [PATCH 09/10] Remove ErrorKind::TestError to fix compilation errors for release profile --- Makefile | 2 +- lychee-bin/src/formatters/response/color.rs | 6 +++--- lychee-bin/src/formatters/response/emoji.rs | 6 +++--- lychee-bin/src/formatters/response/plain.rs | 4 ++-- lychee-bin/src/formatters/response/task.rs | 4 ++-- lychee-bin/src/formatters/stats/mod.rs | 2 +- lychee-lib/src/types/error.rs | 14 -------------- lychee-lib/src/utils/reqwest.rs | 4 ++-- 8 files changed, 14 insertions(+), 28 deletions(-) diff --git a/Makefile b/Makefile index e0221a5d90..3c64f93354 100644 --- a/Makefile +++ b/Makefile @@ -56,7 +56,7 @@ test: ## Run tests .PHONY: bench bench: ## Run benchmarks - cargo bench --profile=dev + cargo bench .PHONY: doc doc: ## Open documentation diff --git a/lychee-bin/src/formatters/response/color.rs b/lychee-bin/src/formatters/response/color.rs index baf4ef65d9..38180f63be 100644 --- a/lychee-bin/src/formatters/response/color.rs +++ b/lychee-bin/src/formatters/response/color.rs @@ -97,7 +97,7 @@ mod tests { fn test_format_response_with_error_status() { let formatter = ColorFormatter; let body = mock_response_body!( - Status::Error(ErrorKind::TestError), + Status::Error(ErrorKind::EmptyUrl), "https://example.com/404", ); let formatted_response = strip_ansi_codes(&formatter.format_response(&body)); @@ -118,14 +118,14 @@ mod tests { fn test_detailed_response_output() { let formatter = ColorFormatter; let body = mock_response_body!( - Status::Error(ErrorKind::TestError), + Status::Error(ErrorKind::EmptyUrl), "https://example.com/404", ); let response = strip_ansi_codes(&formatter.format_detailed_response(&body)); assert_eq!( response, - " [ERROR] https://example.com/404 | Generic test error: Test error for formatter testing" + " [ERROR] https://example.com/404 | URL cannot be empty: Empty URL found. Check for missing links or malformed markdown" ); } } diff --git a/lychee-bin/src/formatters/response/emoji.rs b/lychee-bin/src/formatters/response/emoji.rs index 616670ac08..0465ab9baa 100644 --- a/lychee-bin/src/formatters/response/emoji.rs +++ b/lychee-bin/src/formatters/response/emoji.rs @@ -54,7 +54,7 @@ mod emoji_tests { fn test_format_response_with_error_status() { let formatter = EmojiFormatter; let body = mock_response_body!( - Status::Error(ErrorKind::TestError), + Status::Error(ErrorKind::EmptyUrl), "https://example.com/404", ); assert_eq!( @@ -103,7 +103,7 @@ mod emoji_tests { fn test_detailed_response_output() { let formatter = EmojiFormatter; let body = mock_response_body!( - Status::Error(ErrorKind::TestError), + Status::Error(ErrorKind::EmptyUrl), "https://example.com/404", ); @@ -111,7 +111,7 @@ mod emoji_tests { assert!( formatter .format_detailed_response(&body) - .contains("Test error for formatter testing") + .contains("Empty URL found") ); } } diff --git a/lychee-bin/src/formatters/response/plain.rs b/lychee-bin/src/formatters/response/plain.rs index 104c1c0920..75979361d3 100644 --- a/lychee-bin/src/formatters/response/plain.rs +++ b/lychee-bin/src/formatters/response/plain.rs @@ -40,12 +40,12 @@ mod plain_tests { fn test_format_response_with_error_status() { let formatter = PlainFormatter; let body = mock_response_body!( - Status::Error(ErrorKind::TestError), + Status::Error(ErrorKind::EmptyUrl), "https://example.com/404", ); assert_eq!( formatter.format_response(&body), - "[ERROR] https://example.com/404 | Generic test error: Test error for formatter testing" + "[ERROR] https://example.com/404 | URL cannot be empty: Empty URL found. Check for missing links or malformed markdown" ); } diff --git a/lychee-bin/src/formatters/response/task.rs b/lychee-bin/src/formatters/response/task.rs index 0775858c62..c873913919 100644 --- a/lychee-bin/src/formatters/response/task.rs +++ b/lychee-bin/src/formatters/response/task.rs @@ -30,12 +30,12 @@ mod task_tests { fn test_format_response_with_error_status() { let formatter = TaskFormatter; let body = mock_response_body!( - Status::Error(ErrorKind::TestError), + Status::Error(ErrorKind::EmptyUrl), "https://example.com/404", ); assert_eq!( formatter.format_response(&body), - "- [ ] [ERROR] https://example.com/404 | Generic test error: Test error for formatter testing" + "- [ ] [ERROR] https://example.com/404 | URL cannot be empty: Empty URL found. Check for missing links or malformed markdown" ); } diff --git a/lychee-bin/src/formatters/stats/mod.rs b/lychee-bin/src/formatters/stats/mod.rs index 0af5600153..7ab8a2509a 100644 --- a/lychee-bin/src/formatters/stats/mod.rs +++ b/lychee-bin/src/formatters/stats/mod.rs @@ -65,7 +65,7 @@ mod tests { fn make_test_response(url_str: &str, source: ResolvedInputSource) -> Response { let uri = Uri::from(make_test_url(url_str)); - Response::new(uri, Status::Error(ErrorKind::TestError), source) + Response::new(uri, Status::Error(ErrorKind::EmptyUrl), source) } #[test] diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 1efab972f3..b4c5f4ff7c 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -178,12 +178,6 @@ pub enum ErrorKind { /// The reason the command failed reason: String, }, - - /// Test-only error variant for formatter tests - /// Available in both test and debug builds to support cross-crate testing - #[cfg(any(test, debug_assertions))] - #[error("Generic test error")] - TestError, } impl ErrorKind { @@ -246,8 +240,6 @@ impl ErrorKind { ErrorKind::EmptyUrl => { Some("Empty URL found. Check for missing links or malformed markdown".to_string()) } - #[cfg(any(test, debug_assertions))] - ErrorKind::TestError => Some("Test error for formatter testing".to_string()), ErrorKind::InvalidFile(path) => Some(format!( "Invalid file path: '{}'. Check if file exists and is readable", path.display() @@ -420,8 +412,6 @@ impl PartialEq for ErrorKind { (Self::InvalidUrlRemap(r1), Self::InvalidUrlRemap(r2)) => r1 == r2, (Self::EmptyUrl, Self::EmptyUrl) => true, (Self::RejectedStatusCode(c1), Self::RejectedStatusCode(c2)) => c1 == c2, - #[cfg(any(test, debug_assertions))] - (Self::TestError, Self::TestError) => true, _ => false, } @@ -472,10 +462,6 @@ impl Hash for ErrorKind { Self::MissingGitHubToken | Self::InvalidUrlHost => { std::mem::discriminant(self).hash(state); } - #[cfg(any(test, debug_assertions))] - Self::TestError => { - std::mem::discriminant(self).hash(state); - } Self::Regex(e) => e.to_string().hash(state), Self::BasicAuthExtractorError(e) => e.to_string().hash(state), Self::Cookies(e) => e.hash(state), diff --git a/lychee-lib/src/utils/reqwest.rs b/lychee-lib/src/utils/reqwest.rs index 8c54203e52..070523f04b 100644 --- a/lychee-lib/src/utils/reqwest.rs +++ b/lychee-lib/src/utils/reqwest.rs @@ -411,10 +411,10 @@ mod tests { // (actual reqwest::Error creation is complex, so we test the integration point) // For other error types, ensure they still work - let test_error = ErrorKind::TestError; + let test_error = ErrorKind::EmptyUrl; assert_eq!( test_error.details(), - Some("Test error for formatter testing".to_string()) + Some("Empty URL found. Check for missing links or malformed markdown".to_string()) ); } } From b2d73a9cdb2357809dba16be302271665a202708 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Tue, 4 Nov 2025 11:23:51 +0100 Subject: [PATCH 10/10] Update help message --- README.md | 14 ++++++-------- lychee-bin/src/options.rs | 14 ++++++-------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 173bec4333..fef6062a1f 100644 --- a/README.md +++ b/README.md @@ -621,17 +621,15 @@ Options: #!/usr/bin/env bash case "$1" in - *.epub|*.odt|*.docx|*.ipynb) - exec pandoc "$1" --to=html --wrap=none --markdown-headings=atx - ;; - *.adoc|*.asciidoc) - asciidoctor -a stylesheet! "$1" -o - - ;; *.pdf) - exec pdftotext "$1" - + exec pdftohtml -i -s -stdout "$1" + ;; + *.odt|*.docx|*.epub|*.ipynb) + exec pandoc "$1" --to=html --wrap=none ;; *) - exec cat # identity function, output input without changes + # identity function, output input without changes + exec cat ;; esac diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 0561edc9b0..6d97dc2c5a 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -871,17 +871,15 @@ wrapper program such as a shell script. An example script looks like this: #!/usr/bin/env bash case "$1" in -*.epub|*.odt|*.docx|*.ipynb) - exec pandoc "$1" --to=html --wrap=none --markdown-headings=atx - ;; -*.adoc|*.asciidoc) - asciidoctor -a stylesheet! "$1" -o - - ;; *.pdf) - exec pdftotext "$1" - + exec pdftohtml -i -s -stdout "$1" + ;; +*.odt|*.docx|*.epub|*.ipynb) + exec pandoc "$1" --to=html --wrap=none ;; *) - exec cat # identity function, output input without changes + # identity function, output input without changes + exec cat ;; esac"# )]