Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -609,6 +609,30 @@ Options:
--offline
Only check local files and block network requests

-p, --preprocess <COMMAND>
Preprocess input files.
For each file input, this flag causes lychee to execute `COMMAND PATH` and process
its standard output instead of the original contents of PATH. This allows you to
convert files that would otherwise not be understood by lychee. The preprocessor
COMMAND is only run on input files, not on standard input or URLs.

To invoke programs with custom arguments or to use multiple preprocessors, use a
wrapper program such as a shell script. An example script looks like this:

#!/usr/bin/env bash
case "$1" in
*.pdf)
exec pdftohtml -i -s -stdout "$1"
;;
*.odt|*.docx|*.epub|*.ipynb)
exec pandoc "$1" --to=html --wrap=none
;;
*)
# identity function, output input without changes
exec cat "$1"
;;
esac

-q, --quiet...
Less output per occurrence (e.g. `-q` or `-qq`)

Expand Down
1 change: 1 addition & 0 deletions benches/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ publish = false
[dependencies]
lychee-lib = { path = "../lychee-lib", default-features = false }
criterion = "0.7.0"
tokio = "1.48.0"

[features]
email-check = ["lychee-lib/email-check"]
Expand Down
34 changes: 21 additions & 13 deletions benches/src/extract.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
use criterion::{Criterion, criterion_group, criterion_main};
use lychee_lib::extract::Extractor;
use lychee_lib::{FileType, InputContent};
use lychee_lib::{FileType, Input, InputContent};
use std::hint::black_box;
use std::path::PathBuf;

fn extract(paths: &[PathBuf]) {
for path in paths {
let content: InputContent = path.try_into().unwrap();
fn extract(inputs: &Vec<InputContent>) {
for input in inputs {
let extractor = Extractor::default();
let extracted = extractor.extract(&content);
let extracted = extractor.extract(input);
println!("{}", extracted.len());
}
}
Expand Down Expand Up @@ -36,14 +34,24 @@ fn benchmark_input_content_creation(c: &mut Criterion) {

fn benchmark(c: &mut Criterion) {
// Currently Wikipedia's biggest featured article
c.bench_function("extract from large docs", |b| {
b.iter(|| {
extract(black_box(&[
PathBuf::from("../fixtures/bench/elvis.html"),
PathBuf::from("../fixtures/bench/arch.html"),
]))
})
let mut inputs = vec![];

let runtime = tokio::runtime::Builder::new_current_thread()
.build()
.unwrap();

runtime.block_on(async {
inputs = vec![
Input::path_content("../fixtures/bench/elvis.html", None)
.await
.unwrap(),
Input::path_content("../fixtures/bench/arch.html", None)
.await
.unwrap(),
];
});

c.bench_function("extract from large docs", |b| b.iter(|| extract(&inputs)));
}

criterion_group!(
Expand Down
5 changes: 5 additions & 0 deletions fixtures/pre/error_message.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/usr/bin/env bash

# Test fixture: a command that fails after writing a message to stderr.
>&2 echo "Some error message"

# Exit with a non-zero status so the caller sees a failure.
exit 1
3 changes: 3 additions & 0 deletions fixtures/pre/no_error_message.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/usr/bin/env bash

# Test fixture: a command that fails without writing anything to stderr.
exit 1
4 changes: 2 additions & 2 deletions lychee-bin/src/commands/dump_inputs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ pub(crate) async fn dump_inputs(
excluded_paths: &[String],
file_extensions: &FileExtensions,
skip_hidden: bool,
skip_gitignored: bool,
skip_ignored: bool,
) -> Result<ExitCode> {
if let Some(out_file) = output {
fs::File::create(out_file)?;
Expand All @@ -36,7 +36,7 @@ pub(crate) async fn dump_inputs(
let sources_stream = input.get_sources(
file_extensions.clone(),
skip_hidden,
skip_gitignored,
skip_ignored,
&excluded_path_filter,
);
tokio::pin!(sources_stream);
Expand Down
6 changes: 3 additions & 3 deletions lychee-bin/src/formatters/response/color.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ mod tests {
fn test_format_response_with_error_status() {
let formatter = ColorFormatter;
let body = mock_response_body!(
Status::Error(ErrorKind::TestError),
Status::Error(ErrorKind::EmptyUrl),
"https://example.com/404",
);
let formatted_response = strip_ansi_codes(&formatter.format_response(&body));
Expand All @@ -118,14 +118,14 @@ mod tests {
fn test_detailed_response_output() {
let formatter = ColorFormatter;
let body = mock_response_body!(
Status::Error(ErrorKind::TestError),
Status::Error(ErrorKind::EmptyUrl),
"https://example.com/404",
);

let response = strip_ansi_codes(&formatter.format_detailed_response(&body));
assert_eq!(
response,
" [ERROR] https://example.com/404 | Generic test error: Test error for formatter testing"
" [ERROR] https://example.com/404 | URL cannot be empty: Empty URL found. Check for missing links or malformed markdown"
);
}
}
6 changes: 3 additions & 3 deletions lychee-bin/src/formatters/response/emoji.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ mod emoji_tests {
fn test_format_response_with_error_status() {
let formatter = EmojiFormatter;
let body = mock_response_body!(
Status::Error(ErrorKind::TestError),
Status::Error(ErrorKind::EmptyUrl),
"https://example.com/404",
);
assert_eq!(
Expand Down Expand Up @@ -103,15 +103,15 @@ mod emoji_tests {
fn test_detailed_response_output() {
let formatter = EmojiFormatter;
let body = mock_response_body!(
Status::Error(ErrorKind::TestError),
Status::Error(ErrorKind::EmptyUrl),
"https://example.com/404",
);

// Just assert the output contains the expected error message
assert!(
formatter
.format_detailed_response(&body)
.contains("Test error for formatter testing")
.contains("Empty URL found")
);
}
}
4 changes: 2 additions & 2 deletions lychee-bin/src/formatters/response/plain.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,12 @@ mod plain_tests {
fn test_format_response_with_error_status() {
let formatter = PlainFormatter;
let body = mock_response_body!(
Status::Error(ErrorKind::TestError),
Status::Error(ErrorKind::EmptyUrl),
"https://example.com/404",
);
assert_eq!(
formatter.format_response(&body),
"[ERROR] https://example.com/404 | Generic test error: Test error for formatter testing"
"[ERROR] https://example.com/404 | URL cannot be empty: Empty URL found. Check for missing links or malformed markdown"
);
}

Expand Down
4 changes: 2 additions & 2 deletions lychee-bin/src/formatters/response/task.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@ mod task_tests {
fn test_format_response_with_error_status() {
let formatter = TaskFormatter;
let body = mock_response_body!(
Status::Error(ErrorKind::TestError),
Status::Error(ErrorKind::EmptyUrl),
"https://example.com/404",
);
assert_eq!(
formatter.format_response(&body),
"- [ ] [ERROR] https://example.com/404 | Generic test error: Test error for formatter testing"
"- [ ] [ERROR] https://example.com/404 | URL cannot be empty: Empty URL found. Check for missing links or malformed markdown"
);
}

Expand Down
2 changes: 1 addition & 1 deletion lychee-bin/src/formatters/stats/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ mod tests {
fn make_test_response(url_str: &str, source: ResolvedInputSource) -> Response {
let uri = Uri::from(make_test_url(url_str));

Response::new(uri, Status::Error(ErrorKind::TestError), source)
Response::new(uri, Status::Error(ErrorKind::EmptyUrl), source)
}

#[test]
Expand Down
3 changes: 2 additions & 1 deletion lychee-bin/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,8 @@ async fn run(opts: &LycheeOptions) -> Result<i32> {
.excluded_paths(PathExcludes::new(opts.config.exclude_path.clone())?)
// File a bug if you rely on this envvar! It's going to go away eventually.
.use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").is_ok_and(|x| x == "1"))
.include_wikilinks(opts.config.include_wikilinks);
.include_wikilinks(opts.config.include_wikilinks)
.preprocessor(opts.config.preprocess.clone());

collector = if let Some(ref basic_auth) = opts.config.basic_auth {
collector.basic_auth_extractor(BasicAuthExtractor::new(basic_auth)?)
Expand Down
33 changes: 33 additions & 0 deletions lychee-bin/src/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use http::{
HeaderMap,
header::{HeaderName, HeaderValue},
};
use lychee_lib::Preprocessor;
use lychee_lib::{
Base, BasicAuthSelector, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES,
DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT, FileExtensions,
Expand Down Expand Up @@ -853,6 +854,37 @@ and existing cookies will be updated."
#[arg(long)]
#[serde(default)]
pub(crate) include_wikilinks: bool,

/// Preprocess input files.
// NOTE: every branch of the example script below must hand "$1" to the
// program it execs. The preprocessor is invoked as `COMMAND PATH`, so a bare
// `cat` would read standard input rather than the file named by PATH.
#[arg(
short,
long,
value_name = "COMMAND",
long_help = r#"Preprocess input files.
For each file input, this flag causes lychee to execute `COMMAND PATH` and process
its standard output instead of the original contents of PATH. This allows you to
convert files that would otherwise not be understood by lychee. The preprocessor
COMMAND is only run on input files, not on standard input or URLs.

To invoke programs with custom arguments or to use multiple preprocessors, use a
wrapper program such as a shell script. An example script looks like this:

#!/usr/bin/env bash
case "$1" in
*.pdf)
exec pdftohtml -i -s -stdout "$1"
;;
*.odt|*.docx|*.epub|*.ipynb)
exec pandoc "$1" --to=html --wrap=none
;;
*)
# identity function, output input without changes
exec cat "$1"
;;
esac"#
)]
#[serde(default)]
pub(crate) preprocess: Option<Preprocessor>,
}

impl Config {
Expand Down Expand Up @@ -943,6 +975,7 @@ impl Config {
no_progress: false,
offline: false,
output: None,
preprocess: None,
remap: Vec::<String>::new(),
require_https: false,
retry_wait_time: DEFAULT_RETRY_WAIT_TIME_SECS,
Expand Down
73 changes: 67 additions & 6 deletions lychee-bin/tests/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ mod cli {
/// Assert actual output lines equals to expected lines.
/// Order of the lines is ignored.
fn assert_lines_eq<S: AsRef<str> + Ord>(result: Assert, mut expected_lines: Vec<S>) {
let output = result.get_output().stdout.clone();
let output = &result.get_output().stdout;
let mut actual_lines: Vec<String> = output
.lines()
.map(|line| line.unwrap().to_string())
Expand Down Expand Up @@ -1140,11 +1140,7 @@ mod cli {

// Clean up
fs::remove_file(&cache_file).map_err(|e| {
anyhow::anyhow!(
"Failed to remove cache file: {:?}, error: {}",
cache_file,
e
)
anyhow::anyhow!("Failed to remove cache file: {cache_file:?}, error: {e}")
})?;

Ok(())
Expand Down Expand Up @@ -3025,4 +3021,69 @@ mod cli {

Ok(())
}

/// `cat` emits its input unchanged, so using it as the preprocessor must
/// yield the same dumped lines as a run without any preprocessor.
#[test]
fn test_pre_cat() {
    let input = fixtures_path!().join("TEST.md");

    // Run with `cat` as the preprocessor.
    let with_cat = main_command!()
        .arg("--preprocess")
        .arg("cat")
        .arg("--dump")
        .arg(&input)
        .assert()
        .success();

    // Reference run without preprocessing; its stdout lines are the expectation.
    let baseline = main_command!().arg("--dump").arg(&input).assert().success();
    let expected: Vec<String> = baseline
        .get_output()
        .stdout
        .lines()
        .map(|line| line.unwrap())
        .collect();

    assert_lines_eq(with_cat, expected);
}

/// A preprocessor command that cannot be started must make the run fail and
/// report the start-up error on stderr.
#[test]
fn test_pre_invalid_command() {
    let input = fixtures_path!().join("TEST.md");
    let expected_error = "Error: Preprocessor command 'program does not exist' failed: could not start: No such file or directory (os error 2)";

    main_command!()
        .arg("--preprocess")
        .arg("program does not exist")
        .arg(input)
        .assert()
        .failure()
        .stderr(contains(expected_error));
}

/// A preprocessor that exits with a non-zero status must make the run fail.
/// The reported error carries the preprocessor's stderr output, or the
/// placeholder `<empty stderr>` when the script wrote nothing to stderr.
#[test]
fn test_pre_error() {
    let input = fixtures_path!().join("TEST.md");
    let scripts_dir = fixtures_path!().join("pre");

    // Case 1: the script exits with status 1 and prints nothing.
    let silent_script = scripts_dir.join("no_error_message.sh");
    let expected_silent = format!(
        "Error: Preprocessor command '{}' failed: exited with non-zero code: <empty stderr>",
        silent_script.to_str().unwrap()
    );
    main_command!()
        .arg("--preprocess")
        .arg(&silent_script)
        .arg(&input)
        .assert()
        .failure()
        .stderr(contains(expected_silent));

    // Case 2: the script writes a message to stderr before exiting with status 1.
    let noisy_script = scripts_dir.join("error_message.sh");
    let expected_noisy = format!(
        "Error: Preprocessor command '{}' failed: exited with non-zero code: Some error message",
        noisy_script.to_str().unwrap()
    );
    main_command!()
        .arg("--preprocess")
        .arg(&noisy_script)
        .arg(&input)
        .assert()
        .failure()
        .stderr(contains(expected_noisy));
}
}
Loading