Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,15 @@ Options:
Do not show progress bar.
This is recommended for non-interactive shells (e.g. for continuous integration)

--extensions <EXTENSIONS>
Test the specified file extensions for URIs when checking files locally.

Multiple extensions can be separated by commas. Note that if you want to check filetypes,
which have multiple extensions, e.g. HTML files with both .html and .htm extensions, you need to
specify both extensions explicitly.

[default: md,mkd,mdx,mdown,mdwn,mkdn,mkdown,markdown,html,htm,txt]

--cache
Use request cache stored on disk at `.lycheecache`

Expand Down
2 changes: 1 addition & 1 deletion lychee-bin/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ async fn run(opts: &LycheeOptions) -> Result<i32> {
collector
};

let requests = collector.collect_links(inputs);
let requests = collector.collect_links_from_file_types(inputs, opts.config.extensions.clone());

let cache = load_cache(&opts.config).unwrap_or_default();
let cache = Arc::new(cache);
Expand Down
25 changes: 23 additions & 2 deletions lychee-bin/src/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@ use clap::builder::PossibleValuesParser;
use clap::{arg, builder::TypedValueParser, Parser};
use const_format::{concatcp, formatcp};
use lychee_lib::{
Base, BasicAuthSelector, Input, StatusCodeExcluder, StatusCodeSelector, DEFAULT_MAX_REDIRECTS,
DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT,
Base, BasicAuthSelector, FileExtensions, FileType, Input, StatusCodeExcluder,
StatusCodeSelector, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS,
DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT,
};
use secrecy::{ExposeSecret, SecretString};
use serde::Deserialize;
Expand Down Expand Up @@ -228,6 +229,25 @@ pub(crate) struct Config {
#[serde(default)]
pub(crate) no_progress: bool,

/// A list of file extensions. Files not matching the specified extensions are skipped.
///
/// E.g. a user can specify `--extensions html,htm,php,asp,aspx,jsp,cgi`
/// to check for links in files with these extensions.
///
/// This is useful when the default extensions are not enough and you don't
/// want to provide a long list of inputs (e.g. file1.html, file2.md, etc.)
#[arg(
long,
default_value_t = FileExtensions::default(),
long_help = "Test the specified file extensions for URIs when checking files locally.

Multiple extensions can be separated by commas. Note that if you want to check filetypes,
which have multiple extensions, e.g. HTML files with both .html and .htm extensions, you need to
specify both extensions explicitly."
)]
#[serde(default = "FileExtensions::default")]
pub(crate) extensions: FileExtensions,

#[arg(help = HELP_MSG_CACHE)]
#[arg(long)]
#[serde(default)]
Expand Down Expand Up @@ -584,6 +604,7 @@ impl Config {
cookie_jar: None;
include_fragments: false;
accept: StatusCodeSelector::default();
extensions: FileType::default_extensions();
}

if self
Expand Down
36 changes: 27 additions & 9 deletions lychee-lib/src/collector.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
use crate::ErrorKind;
use crate::InputSource;
use crate::{
basic_auth::BasicAuthExtractor, extract::Extractor, types::uri::raw::RawUri, utils::request,
Base, Input, Request, Result,
basic_auth::BasicAuthExtractor, extract::Extractor, types::uri::raw::RawUri,
types::FileExtensions, utils::request, Base, Input, Request, Result,
};
use futures::TryStreamExt;
use futures::{
Expand Down Expand Up @@ -119,28 +119,39 @@ impl Collector {
.flatten()
}

/// Collect all unique links from `inputs` using the default set of
/// file extensions.
///
/// Thin wrapper around `collect_links_from_file_types` for callers that
/// do not need to supply their own extension list.
pub fn collect_links(self, inputs: Vec<Input>) -> impl Stream<Item = Result<Request>> {
    let default_extensions = crate::types::FileType::default_extensions();
    self.collect_links_from_file_types(inputs, default_extensions)
}

/// Fetch all unique links from inputs.
/// All relative URLs get prefixed with `base` (if given).
/// (This can be a directory or a base URL)
///
/// # Errors
///
/// Will return `Err` if links cannot be extracted from an input
pub fn collect_links(self, inputs: Vec<Input>) -> impl Stream<Item = Result<Request>> {
pub fn collect_links_from_file_types(
self,
inputs: Vec<Input>,
extensions: FileExtensions,
) -> impl Stream<Item = Result<Request>> {
let skip_missing_inputs = self.skip_missing_inputs;
let skip_hidden = self.skip_hidden;
let skip_ignored = self.skip_ignored;
let global_base = self.base;
stream::iter(inputs)
.par_then_unordered(None, move |input| {
let default_base = global_base.clone();
let extensions = extensions.clone();
async move {
let base = match &input.source {
InputSource::RemoteUrl(url) => Base::try_from(url.as_str()).ok(),
_ => default_base,
};
input
.get_contents(skip_missing_inputs, skip_hidden, skip_ignored)
.get_contents(skip_missing_inputs, skip_hidden, skip_ignored, extensions)
.map(move |content| (content, base.clone()))
}
})
Expand Down Expand Up @@ -191,15 +202,19 @@ mod tests {
Ok(responses.map(|r| r.unwrap().uri).collect().await)
}

// Helper function for collecting verbatim links
/// Helper function for collecting verbatim links
///
/// A verbatim link is a link that is not parsed by the HTML parser.
/// For example, a link in a code block or a script tag.
async fn collect_verbatim(
inputs: Vec<Input>,
root_dir: Option<PathBuf>,
base: Option<Base>,
extensions: FileExtensions,
) -> Result<HashSet<Uri>> {
let responses = Collector::new(root_dir, base)?
.include_verbatim(true)
.collect_links(inputs);
.collect_links_from_file_types(inputs, extensions);
Ok(responses.map(|r| r.unwrap().uri).collect().await)
}

Expand All @@ -217,7 +232,7 @@ mod tests {
let _file = File::create(&file_path).unwrap();
let input = Input::new(&file_path.as_path().display().to_string(), None, true, None)?;
let contents: Vec<_> = input
.get_contents(true, true, true)
.get_contents(true, true, true, FileType::default_extensions())
.collect::<Vec<_>>()
.await;

Expand All @@ -230,7 +245,7 @@ mod tests {
async fn test_url_without_extension_is_html() -> Result<()> {
let input = Input::new("https://example.com/", None, true, None)?;
let contents: Vec<_> = input
.get_contents(true, true, true)
.get_contents(true, true, true, FileType::default_extensions())
.collect::<Vec<_>>()
.await;

Expand Down Expand Up @@ -288,7 +303,10 @@ mod tests {
},
];

let links = collect_verbatim(inputs, None, None).await.ok().unwrap();
let links = collect_verbatim(inputs, None, None, FileType::default_extensions())
.await
.ok()
.unwrap();

let expected_links = HashSet::from_iter([
website(TEST_STRING),
Expand Down
6 changes: 3 additions & 3 deletions lychee-lib/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,8 @@ pub use crate::{
filter::{Excludes, Filter, Includes},
types::{
uri::valid::Uri, AcceptRange, AcceptRangeError, Base, BasicAuthCredentials,
BasicAuthSelector, CacheStatus, CookieJar, ErrorKind, FileType, Input, InputContent,
InputSource, Request, Response, ResponseBody, Result, Status, StatusCodeExcluder,
StatusCodeSelector,
BasicAuthSelector, CacheStatus, CookieJar, ErrorKind, FileExtensions, FileType, Input,
InputContent, InputSource, Request, Response, ResponseBody, Result, Status,
StatusCodeExcluder, StatusCodeSelector,
},
};
Loading
Loading