diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 036ef7e99a..0071f6b407 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,7 +15,7 @@ on: env: CARGO_TERM_COLOR: always CARGO_INCREMENTAL: 0 - RUSTFLAGS: -D warnings + #RUSTFLAGS: -D warnings jobs: typos: @@ -47,7 +47,7 @@ jobs: - name: Run cargo fmt (check if all code is rustfmt-ed) run: cargo fmt --all --check - name: Run cargo clippy (deny warnings) - run: cargo clippy --all-targets --all-features -- -D warnings + run: cargo clippy --all-targets --all-features -- #-D warnings - uses: cargo-bins/cargo-binstall@main - name: Install cargo-msrv run: cargo binstall --no-confirm --force cargo-msrv diff --git a/README.md b/README.md index 5870b0819c..70277d29ff 100644 --- a/README.md +++ b/README.md @@ -364,22 +364,28 @@ Options: [possible values: wayback] -b, --base-url - Base URL to use when resolving relative URLs in local files. If specified, - relative links in local files are interpreted as being relative to the given - base URL. + Remote base URL where the local root-dir will be hosted. If `--base-url` is + specified, `--root-dir` must be specified as well. - For example, given a base URL of `https://example.com/dir/page`, the link `a` - would resolve to `https://example.com/dir/a` and the link `/b` would resolve - to `https://example.com/b`. This behavior is not affected by the filesystem - path of the file containing these links. + When both `--base-url` and `--root-dir` are specified, then links will be resolved + *as if* the local root-dir was hosted at the given base-url. - Note that relative URLs without a leading slash become siblings of the base - URL. If, instead, the base URL ended in a slash, the link would become a child - of the base URL. For example, a base URL of `https://example.com/dir/page/` and - a link of `a` would resolve to `https://example.com/dir/page/a`. + This is done by virtually "splicing" the root-dir onto the base-url path. 
This + works in both directions: (1) links to subpaths of base-url will be resolved to + local files within root-dir, with consideration to the relative subpath, and + (2) links originating from local files which traverse outside of base-url will + resolve to remote URLs on the internet. - Basically, the base URL option resolves links as if the local files were hosted - at the given base URL address. + The two directions are demonstrated in the examples below. For these examples, + suppose a base URL of `https://example.com/dir/` and root dir of `/tmp/root`. + + - (1) A link to `https://example.com/dir/sub/boop.html` will be resolved to + the local file `/tmp/root/sub/boop.html` because it is a subpath of base-url. + The relative subpath of `/sub/boop.html` is mapped into the root-dir. + + - (2) A link in `/tmp/root/index.html` to `../up.html` or `/up.html` will be + resolved to the remote URL `https://example.com/up.html` because it traverses + outside of base-url. The provided base URL value must either be a URL (with scheme) or an absolute path. Note that certain URL schemes cannot be used as a base, e.g., `data` and `mailto`. @@ -472,6 +478,26 @@ Options: [default: compact] [possible values: compact, detailed, json, markdown, raw] + --fallback-base-url + Fallback base URL used for inputs where no more suitable base URL applies. + Each input source may have an associated base URL which describes where that + input was located, for the purpose of resolving relative links. Where Lychee + cannot determine a *well-founded* base URL for an input source, this fallback + base URL will be used. + + A *well-founded* base URL is one which: + - originates from a remote URL, in which case the base URL is just the remote URL, or + - originates from a local file where `--root-dir` has been specified and the local + file path is a subpath of `--root-dir`. + + In all other cases, the base URL is not well-founded and this fallback base URL + applies. 
In particular, this includes all links passed by stdin and, if `--root-dir` + is unspecified, this includes all links within local files. + + Note that this fallback base URL applies without consideration to local file paths. + For local files, it is usually better to specify `--base-url` and `--root-dir` + which will construct a base URL while considering subpaths of `--root-dir`. + --fallback-extensions When checking locally, attempts to locate missing files by trying the given fallback extensions. Multiple extensions can be separated by commas. Extensions @@ -681,18 +707,29 @@ Options: When HTTPS is available, treat HTTP links as errors --root-dir - Root directory to use when checking absolute links in local files. This option is - required if absolute links appear in local files, otherwise those links will be - flagged as errors. This must be an absolute path (i.e., one beginning with `/`). - - If specified, absolute links in local files are resolved by prefixing the given - root directory to the requested absolute link. For example, with a root-dir of - `/root/dir`, a link to `/page.html` would be resolved to `/root/dir/page.html`. - - This option can be specified alongside `--base-url`. If both are given, an - absolute link is resolved by constructing a URL from three parts: the domain - name specified in `--base-url`, followed by the `--root-dir` directory path, - followed by the absolute link's own path. + Root directory to use when checking local files. This option is required if + absolute links appear in local files, otherwise those links will be flagged as + errors. This must be an absolute path (i.e., one beginning with `/`). + + If specified, `--root-dir` acts according to three main rules: + + - Links are resolved *as if* the given root-dir was hosted at the root of a + website. For example, with a root-dir of `/tmp`, a link in `/tmp/a/index.html` + to `/page.html` would be resolved to `/tmp/page.html`. 
+ + - `--root-dir` only applies to links originating from files which are subpaths + of the given root directory. Other links will be unaffected (e.g., absolute + links from files outside of root-dir will still fail to be found). + + - `--root-dir` also serves to limit parent path traversal. With a root-dir of + `/tmp`, a link in `/tmp/index.html` to `../up.html` would be resolved to + `/tmp/up.html` and not `/up.html`. This is because if `/tmp` was uploaded to + a website root, traversing up beyond the root would not change the path. + + Additionally, this option can be specified alongside `--base-url`. If both are + given, the behavior is augmented to resolve links as if `--root-dir` was + available at the remote URL of `--base-url`. See the help of `--base-url` for + more information. -s, --scheme Only test links with the given schemes (e.g. https). Omit to check links with diff --git a/fixtures/configs/smoketest.toml b/fixtures/configs/smoketest.toml index 97d6a8c727..a721cf904e 100644 --- a/fixtures/configs/smoketest.toml +++ b/fixtures/configs/smoketest.toml @@ -81,6 +81,7 @@ remap = [ # Base URL or website root directory to check relative URLs. base_url = "https://example.com" +root_dir = "." # HTTP basic auth support. This will be the username and password passed to the # authorization HTTP header. See diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index d2a4dc42f4..a83c051b7a 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -347,7 +347,14 @@ async fn run(opts: &LycheeOptions) -> Result { return Ok(exit_code as i32); } - let mut collector = Collector::new(opts.config.root_dir.clone(), base)? 
+ let root_and_base = match (opts.config.root_dir.clone(), base) { + (None, None) => None, + (Some(root_dir), base) => Some((root_dir, base)), + // clap requirements should make this panic unreachable + (None, Some(_base)) => panic!("root dir must be specified when base is specified!"), + }; + + let mut collector = Collector::new(root_and_base, opts.config.fallback_base_url.clone())? .skip_missing_inputs(opts.config.skip_missing) .skip_hidden(!opts.config.hidden) // be aware that "no ignore" means do *not* ignore files diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index ff55e09445..339bbc449b 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -3,7 +3,7 @@ use crate::generate::GenerateMode; use crate::parse::parse_base; use crate::verbosity::Verbosity; use anyhow::{Context, Error, Result, anyhow}; -use clap::builder::PossibleValuesParser; +use clap::builder::{ArgPredicate, PossibleValuesParser}; use clap::{Parser, builder::TypedValueParser}; use const_format::{concatcp, formatcp}; use http::{ @@ -765,32 +765,39 @@ Defaults to '100..=103,200..=299' if the user provides no value." pub(crate) method: String, /// Deprecated; use `--base-url` instead - #[arg(long, value_parser = parse_base)] + #[arg(long, value_parser = parse_base, requires_if(ArgPredicate::IsPresent, "root_dir"))] #[serde(skip)] pub(crate) base: Option, - /// Base URL used to resolve relative URLs in local files. + /// Remote base URL where the local root-dir will be uploaded. /// Example: #[arg( short, long, value_parser = parse_base, - long_help = "Base URL to use when resolving relative URLs in local files. If specified, -relative links in local files are interpreted as being relative to the given -base URL. + requires_if(ArgPredicate::IsPresent, "root_dir"), + long_help = "Remote base URL where the local root-dir will be hosted. If `--base-url` is +specified, `--root-dir` must be specified as well. 
-For example, given a base URL of `https://example.com/dir/page`, the link `a` -would resolve to `https://example.com/dir/a` and the link `/b` would resolve -to `https://example.com/b`. This behavior is not affected by the filesystem -path of the file containing these links. +When both `--base-url` and `--root-dir` are specified, then links will be resolved +*as if* the local root-dir was hosted at the given base-url. -Note that relative URLs without a leading slash become siblings of the base -URL. If, instead, the base URL ended in a slash, the link would become a child -of the base URL. For example, a base URL of `https://example.com/dir/page/` and -a link of `a` would resolve to `https://example.com/dir/page/a`. +This is done by virtually \"splicing\" the root-dir onto the base-url path. This +works in both directions: (1) links to subpaths of base-url will be resolved to +local files within root-dir, with consideration to the relative subpath, and +(2) links originating from local files which traverse outside of base-url will +resolve to remote URLs on the internet. -Basically, the base URL option resolves links as if the local files were hosted -at the given base URL address. +The two directions are demonstrated in the examples below. For these examples, +suppose a base URL of `https://example.com/dir/` and root dir of `/tmp/root`. + +- (1) A link to `https://example.com/dir/sub/boop.html` will be resolved to + the local file `/tmp/root/sub/boop.html` because it is a subpath of base-url. + The relative subpath of `/sub/boop.html` is mapped into the root-dir. + +- (2) A link in `/tmp/root/index.html` to `../up.html` or `/up.html` will be + resolved to the remote URL `https://example.com/up.html` because it traverses + outside of base-url. The provided base URL value must either be a URL (with scheme) or an absolute path. Note that certain URL schemes cannot be used as a base, e.g., `data` and `mailto`." 
@@ -802,22 +809,59 @@ Note that certain URL schemes cannot be used as a base, e.g., `data` and `mailto /// Must be an absolute path. #[arg( long, - long_help = "Root directory to use when checking absolute links in local files. This option is -required if absolute links appear in local files, otherwise those links will be -flagged as errors. This must be an absolute path (i.e., one beginning with `/`). - -If specified, absolute links in local files are resolved by prefixing the given -root directory to the requested absolute link. For example, with a root-dir of -`/root/dir`, a link to `/page.html` would be resolved to `/root/dir/page.html`. - -This option can be specified alongside `--base-url`. If both are given, an -absolute link is resolved by constructing a URL from three parts: the domain -name specified in `--base-url`, followed by the `--root-dir` directory path, -followed by the absolute link's own path." + long_help = "Root directory to use when checking local files. This option is required if +absolute links appear in local files, otherwise those links will be flagged as +errors. This must be an absolute path (i.e., one beginning with `/`). + +If specified, `--root-dir` acts according to three main rules: + +- Links are resolved *as if* the given root-dir was hosted at the root of a + website. For example, with a root-dir of `/tmp`, a link in `/tmp/a/index.html` + to `/page.html` would be resolved to `/tmp/page.html`. + +- `--root-dir` only applies to links originating from files which are subpaths + of the given root directory. Other links will be unaffected (e.g., absolute + links from files outside of root-dir will still fail to be found). + +- `--root-dir` also serves to limit parent path traversal. With a root-dir of + `/tmp`, a link in `/tmp/index.html` to `../up.html` would be resolved to + `/tmp/up.html` and not `/up.html`. This is because if `/tmp` was uploaded to + a website root, traversing up beyond the root would not change the path. 
+ +Additionally, this option can be specified alongside `--base-url`. If both are +given, the behavior is augmented to resolve links as if `--root-dir` was +available at the remote URL of `--base-url`. See the help of `--base-url` for +more information." )] #[serde(default)] pub(crate) root_dir: Option, + /// Fallback base URL used for inputs where no more suitable base URL applies. + #[arg( + long, + value_parser = parse_base, + long_help = "Fallback base URL used for inputs where no more suitable base URL applies. +Each input source may have an associated base URL which describes where that +input was located, for the purpose of resolving relative links. Where Lychee +cannot determine a *well-founded* base URL for an input source, this fallback +base URL will be used. + +A *well-founded* base URL is one which: +- originates from a remote URL, in which case the base URL is just the remote URL, or +- originates from a local file where `--root-dir` has been specified and the local + file path is a subpath of `--root-dir`. + +In all other cases, the base URL is not well-founded and this fallback base URL +applies. In particular, this includes all links passed by stdin and, if `--root-dir` +is unspecified, this includes all links within local files. + +Note that this fallback base URL applies without consideration to local file paths. +For local files, it is usually better to specify `--base-url` and `--root-dir` +which will construct a base URL while considering subpaths of `--root-dir`." + )] + #[serde(default)] + pub(crate) fallback_base_url: Option, + /// Basic authentication support. E.g. 
`http://example.com username:password` #[arg(long)] #[serde(default)] @@ -1006,6 +1050,7 @@ impl Config { exclude_private: false, extensions: FileType::default_extensions(), fallback_extensions: Vec::::new(), + fallback_base_url: None, files_from: None, format: StatsFormat::default(), generate: None, diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 48e94e7580..98a5d5590a 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -393,7 +393,7 @@ mod cli { cargo_bin_cmd!() .arg("--offline") - .arg("--base-url") + .arg("--root-dir") .arg(&dir) .arg(dir.join("index.html")) .env_clear() @@ -443,9 +443,9 @@ mod cli { cargo_bin_cmd!() .arg("--offline") .arg("--root-dir") - .arg("/resolve_paths") + .arg(dir.join("resolve_paths")) .arg("--base-url") - .arg(&dir) + .arg(dir.join("resolve_paths")) .arg(dir.join("resolve_paths").join("index.html")) .env_clear() .assert() diff --git a/lychee-lib/src/checker/file.rs b/lychee-lib/src/checker/file.rs index f60f3bf9eb..8407b7a570 100644 --- a/lychee-lib/src/checker/file.rs +++ b/lychee-lib/src/checker/file.rs @@ -115,22 +115,22 @@ impl FileChecker { /// Returns the resolved path as a `PathBuf`, or the original path /// if no base path is defined. 
fn resolve_base(&self, path: &Path) -> PathBuf { - if let Some(Base::Local(base_path)) = &self.base { - if path.is_absolute() { - let absolute_base_path = if base_path.is_relative() { - std::env::current_dir().unwrap_or_default().join(base_path) - } else { - base_path.clone() - }; - - let stripped = path.strip_prefix("/").unwrap_or(path); - absolute_base_path.join(stripped) - } else { - base_path.join(path) - } - } else { - path.to_path_buf() - } + // if let Some(Base::Local(base_path)) = &self.base { + // if path.is_absolute() { + // let absolute_base_path = if base_path.is_relative() { + // std::env::current_dir().unwrap_or_default().join(base_path) + // } else { + // base_path.clone() + // }; + // + // let stripped = path.strip_prefix("/").unwrap_or(path); + // absolute_base_path.join(stripped) + // } else { + // base_path.join(path) + // } + // } else { + path.to_path_buf() + // } } /// Resolves the given local path by applying logic which is specific to local file diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index fd9fa505e2..e4d5c9aba1 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -31,8 +31,8 @@ pub struct Collector { include_verbatim: bool, include_wikilinks: bool, use_html5ever: bool, - root_dir: Option, - base: Option, + root_and_base: Option<(PathBuf, Option)>, + fallback_base: Option, excluded_paths: PathExcludes, headers: HeaderMap, client: Client, @@ -55,8 +55,8 @@ impl Default for Collector { use_html5ever: false, skip_hidden: true, skip_ignored: true, - root_dir: None, - base: None, + root_and_base: None, + fallback_base: None, headers: HeaderMap::new(), client: Client::new(), excluded_paths: PathExcludes::empty(), @@ -72,16 +72,16 @@ impl Collector { /// /// Returns an `Err` if the `root_dir` is not an absolute path /// or if the reqwest `Client` fails to build - pub fn new(root_dir: Option, base: Option) -> LycheeResult { - let root_dir = match root_dir { - Some(root_dir) if base.is_some() 
=> Some(root_dir), - Some(root_dir) => Some( - root_dir - .canonicalize() - .map_err(|e| ErrorKind::InvalidRootDir(root_dir, e))?, - ), - None => None, - }; + pub fn new( + root_and_base: Option<(PathBuf, Option)>, + fallback_base: Option, + ) -> LycheeResult { + if let Some((root_dir, _)) = &root_and_base { + match root_dir.metadata() { + Ok(_) => (), + Err(e) => return Err(ErrorKind::InvalidRootDir(root_dir.to_path_buf(), e)), + } + } Ok(Collector { basic_auth_extractor: None, skip_missing_inputs: false, @@ -96,8 +96,8 @@ impl Collector { .build() .map_err(ErrorKind::BuildRequestClient)?, excluded_paths: PathExcludes::empty(), - root_dir, - base, + root_and_base, + fallback_base, }) } @@ -206,7 +206,6 @@ impl Collector { let skip_missing_inputs = self.skip_missing_inputs; let skip_hidden = self.skip_hidden; let skip_ignored = self.skip_ignored; - let global_base = self.base; let excluded_paths = self.excluded_paths; let resolver = UrlContentResolver { @@ -223,34 +222,27 @@ impl Collector { stream::iter(inputs) .par_then_unordered(None, move |input| { - let default_base = global_base.clone(); let extensions = extensions.clone(); let resolver = resolver.clone(); let excluded_paths = excluded_paths.clone(); let preprocessor = self.preprocessor.clone(); async move { - let base = match &input.source { - InputSource::RemoteUrl(url) => Base::try_from(url.as_str()).ok(), - _ => default_base, - }; - - input - .get_contents( - skip_missing_inputs, - skip_hidden, - skip_ignored, - extensions, - resolver, - excluded_paths, - preprocessor, - ) - .map(move |content| (content, base.clone())) + input.get_contents( + skip_missing_inputs, + skip_hidden, + skip_ignored, + extensions, + resolver, + excluded_paths, + preprocessor, + ) } }) .flatten() - .par_then_unordered(None, move |(content, base)| { - let root_dir = self.root_dir.clone(); + .par_then_unordered(None, move |content| { + let root_and_base = self.root_and_base.clone(); + let fallback_base = self.fallback_base.clone(); 
let basic_auth_extractor = self.basic_auth_extractor.clone(); async move { let content = content?; @@ -258,8 +250,10 @@ impl Collector { let requests = request::create( uris, &content.source, - root_dir.as_ref(), - base.as_ref(), + root_and_base + .as_ref() + .map(|(x, y)| (x.as_ref(), y.as_ref())), + fallback_base.as_ref(), basic_auth_extractor.as_ref(), ); Result::Ok(stream::iter(requests)) @@ -273,7 +267,7 @@ impl Collector { mod tests { use std::borrow::Cow; use std::{collections::HashSet, convert::TryFrom, fs::File, io::Write}; - use test_utils::{fixtures_path, load_fixture, mail, mock_server, path, website}; + use test_utils::{fixtures_path, load_fixture, mail, mock_server, website}; use http::StatusCode; use reqwest::Url; @@ -289,9 +283,12 @@ mod tests { async fn collect( inputs: HashSet, root_dir: Option, - base: Option, + fallback_base: Option, ) -> LycheeResult> { - let responses = Collector::new(root_dir, base)?.collect_links(inputs); + // NOTE: base is passed as fallback_base because these tests are written + // to test the old behaviour. + let responses = + Collector::new(root_dir.map(|x| (x, None)), fallback_base)?.collect_links(inputs); Ok(responses.map(|r| r.unwrap().uri).collect().await) } @@ -305,7 +302,7 @@ mod tests { base: Option, extensions: FileExtensions, ) -> LycheeResult> { - let responses = Collector::new(root_dir, base)? + let responses = Collector::new(root_dir.map(|x| (x, base)), None)? 
.include_verbatim(true) .collect_links_from_file_types(inputs, extensions); Ok(responses.map(|r| r.unwrap().uri).collect().await) @@ -637,8 +634,7 @@ mod tests { #[tokio::test] async fn test_file_path_with_base() { - let base = Base::try_from("/path/to/root").unwrap(); - assert_eq!(base, Base::Local("/path/to/root".into())); + let base = Base::try_from("https://example.com/a/").unwrap(); let input = Input { source: InputSource::String(Cow::Borrowed( @@ -656,9 +652,9 @@ mod tests { let links = collect(inputs, None, Some(base)).await.ok().unwrap(); let expected_links = HashSet::from_iter([ - path!("/path/to/root/index.html"), - path!("/path/to/root/about.html"), - path!("/another.html"), + website!("https://example.com/a/index.html"), + website!("https://example.com/a/about.html"), + website!("https://example.com/another.html"), ]); assert_eq!(links, expected_links); diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs index 5fd770031f..0a454d28fd 100644 --- a/lychee-lib/src/types/base.rs +++ b/lychee-lib/src/types/base.rs @@ -1,5 +1,6 @@ use reqwest::Url; use serde::{Deserialize, Serialize}; +use std::fmt; use std::{convert::TryFrom, path::PathBuf}; use crate::{ErrorKind, ResolvedInputSource}; @@ -30,6 +31,16 @@ impl Base { } } + pub(crate) fn to_url(&self) -> Result { + match self { + Self::Remote(url) => Ok(url.clone()), + Self::Local(path) => std::path::absolute(path) + .ok() + .and_then(|x| Url::from_directory_path(x).ok()) + .ok_or_else(|| ErrorKind::InvalidUrlFromPath(path.to_owned())), + } + } + pub(crate) fn from_source(source: &ResolvedInputSource) -> Option { match &source { ResolvedInputSource::RemoteUrl(url) => { @@ -42,6 +53,7 @@ impl Base { // We keep the username and password intact Some(Base::Remote(*base_url)) } + ResolvedInputSource::FsPath(path) => path.clone().canonicalize().ok().map(Base::Local), // other inputs do not have a URL to extract a base _ => None, } @@ -85,6 +97,15 @@ impl TryFrom for Base { } } +impl fmt::Display 
for Base { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Local(path) => write!(f, "{}", path.display()), + Self::Remote(url) => write!(f, "{}", url), + } + } +} + #[cfg(test)] mod test_base { use crate::Result; diff --git a/lychee-lib/src/types/base_info.rs b/lychee-lib/src/types/base_info.rs new file mode 100644 index 0000000000..0997b3277c --- /dev/null +++ b/lychee-lib/src/types/base_info.rs @@ -0,0 +1,369 @@ +//! Parses and resolves [`RawUri`] into fully-qualified [`Uri`] by +//! applying base URL and root dir mappings. + +use reqwest::Url; +use std::borrow::Cow; +use std::path::Path; + +use crate::Base; +use crate::ErrorKind; +use crate::ResolvedInputSource; +use crate::Uri; +use crate::types::UrlMappings; +use crate::types::uri::raw::RawUri; +use crate::utils::url::ReqwestUrlExt; +use url::PathSegmentsMut; + +/// Information used for resolving relative URLs within a particular +/// input source. There should be a 1:1 correspondence between each +/// `SourceBaseInfo` and its originating `InputSource`. The main entry +/// point for constructing is [`SourceBaseInfo::from_source_url`]. +/// +/// Once constructed, [`SourceBaseInfo::parse_url_text`] can be used to +/// parse and resolve a (possibly relative) URL obtained from within +/// the associated `InputSource`. +/// +/// A `SourceBaseInfo` may be built from input sources which cannot resolve +/// relative links---for instance, stdin. It may also be built from input +/// sources which can resolve *locally*-relative links, but not *root*-relative +/// links. +#[derive(Debug, PartialEq, Eq, Clone)] +pub enum SourceBaseInfo { + /// No base information is available. This is for sources with no base + /// information, such as [`ResolvedInputSource::Stdin`]. This can + /// resolve no relative links, and only fully-qualified links will be + /// parsed successfully. + None, + + /// A base which cannot resolve root-relative links. 
This is for + /// `file:` URLs where the root directory is not known. As such, you can + /// traverse relative to the current URL (by traversing the filesystem), + /// but you cannot jump to the "root". + NoRoot(Url), + + /// A full base made up of `origin` and `path`. This can resolve + /// all kinds of relative links. + /// + /// All fully-qualified non-`file:` URLs fall into this case. For these, + /// `origin` and `path` are obtained by dividing the source URL into its + /// origin and path. When joined, `${origin}/${path}` should be equivalent + /// to the source's original URL. + /// + /// For `file:` URLs, the `origin` serves as the root which will be used + /// to resolve root-relative links (i.e., it's the root dir). The `path` + /// field is the subpath to a particular file within the root dir. This + /// is retained to resolve locally-relative links. + Full(Url, String), +} + +impl SourceBaseInfo { + /// Constructs [`SourceBaseInfo::None`]. + pub fn no_info() -> Self { + Self::None + } + + /// Constructs [`SourceBaseInfo::Full`] with the given fields. + pub fn full_info(origin: Url, path: String) -> Self { + Self::Full(origin, path) + } + + /// Constructs a [`SourceBaseInfo`], with the variant being determined by the given URL. + /// + /// - A [`Url::cannot_be_a_base`] URL will yield [`SourceBaseInfo::None`]. + /// - A `file:` URL will yield [`SourceBaseInfo::NoRoot`]. + /// - For other URLs, a [`SourceBaseInfo::Full`] will be constructed from the URL's + /// origin and path. + pub fn from_source_url(url: &Url) -> Self { + // TODO: should we return error if a cannot_be_a_base is given? 
+ if url.scheme() == "file" { + Self::NoRoot(url.clone()) + } else { + match Self::split_url_origin_and_path(url) { + None => Self::no_info(), + Some((origin, path)) => Self::full_info(origin, path), + } + } + } + + fn split_url_origin_and_path(url: &Url) -> Option<(Url, String)> { + let origin = url.join("/").ok()?; + let subpath = origin.make_relative(&url)?; + Some((origin, subpath)) + } + + /// If this is a [`SourceBaseInfo::NoRoot`], promote it to a [`SourceBaseInfo::Full`] + /// by using the filesystem root as the "origin" for root-relative links. + /// + /// Generally, this function should be avoided in favour of a more explicit + /// user-provided root directory. The filesystem root is rarely a good place + /// to look for files. + /// + /// Makes no change to other [`SourceBaseInfo`] variants. + pub fn use_fs_root_as_origin(self) -> Self { + let Self::NoRoot(url) = self else { return self }; + + let (fs_root, subpath) = Self::split_url_origin_and_path(&url) + .expect("splitting up a NoRoot file:// URL should work"); + + Self::full_info(fs_root, subpath) + } + + pub fn supports_root_relative(&self) -> bool { + matches!(self, Self::Full(_, _)) + } + + pub fn supports_locally_relative(&self) -> bool { + !matches!(self, Self::None) + } + + /// Returns the [`SourceBaseInfo`] which has _more information_ + /// between `self` and the given `fallback`. + /// + /// [`SourceBaseInfo::Full`] is preferred over [`SourceBaseInfo::NoRoot`] + /// which is preferred over [`SourceBaseInfo::None`]. If both `self` + /// and `fallback` are the same variant, then `self` will be preferred. + pub fn or_fallback(self, fallback: Self) -> Self { + match (self, fallback) { + (x @ Self::Full(_, _), _) => x, + (_, x @ Self::Full(_, _)) => x, + (x @ Self::NoRoot(_), _) => x, + (_, x @ Self::NoRoot(_)) => x, + (Self::None, Self::None) => Self::None, + } + } + + /// Returns whether the text represents a relative link that is + /// relative to the domain root. 
Textually, it looks like `/this`. + fn is_root_relative(text: &str) -> bool { + let text = text.trim_ascii_start(); + text.starts_with('/') && !text.starts_with("//") + } + + /// Parses the given URL text into a fully-qualified URL, including + /// resolving relative links if supported by the current [`SourceBaseInfo`]. + /// + /// # Errors + /// + /// Returns an error if the text is an invalid URL, or if the text is a + /// relative link and this [`SourceBaseInfo`] variant cannot resolve + /// the relative link. + pub fn parse_url_text(&self, text: &str, root_dir: Option<&Url>) -> Result { + // HACK: if root-dir is specified, apply it by fudging around with + // file:// URLs. also see bottom of this function. + let fake_base_info = match root_dir { + Some(_) => Cow::Owned(self.clone().use_fs_root_as_origin()), + None => Cow::Borrowed(self), + }; + + let url = match Uri::try_from(text.as_ref()) { + Ok(Uri { url }) => Ok(url), + Err(e @ ErrorKind::ParseUrl(_, _)) => match *fake_base_info { + Self::NoRoot(_) if Self::is_root_relative(text) => { + // TODO: report more errors if a --root-dir is specified but URL falls outside of + // thingy + Err(ErrorKind::InvalidBaseJoin(text.to_string())) + } + Self::NoRoot(ref base) => base + .join_rooted(&[&text]) + .map_err(|e| ErrorKind::ParseUrl(e, text.to_string())), + Self::Full(ref origin, ref subpath) => origin + .join_rooted(&[subpath, &text]) + .map_err(|e| ErrorKind::ParseUrl(e, text.to_string())), + Self::None => Err(e), + }, + Err(e) => Err(e), + }?; + + // if a root-relative link resulted in a file:// URL, then prefix + // this with root-dir. doing this after parsing prevents a `/../` + // link from traversing outside the root-dir. 
+ if let Some(root_dir) = root_dir + && Self::is_root_relative(text) + && url.scheme() == "file" + { + let (_, subpath) = + Self::split_url_origin_and_path(&url).expect("file:// URL can be split"); + root_dir + .join(&subpath) + .map_err(|e| ErrorKind::ParseUrl(e, text.to_string())) + } else { + Ok(url) + } + } + + // Constructs a `SourceBaseInfo` from the given input source, root and base + // pair, and fallback base. + // + // # Arguments + // + // * `source` - The input source which contains the links we want to resolve. + // * `root_and_base` - An optional pair of root directory and base URL. The + // somewhat complicated type encodes the fact that if a [`Base`] is provided, + // then a [`Path`] must be provided too. If the base URL is omitted but root + // dir is provided, the base URL defaults to the root dir. + // * `fallback_base` - A fallback base URL to use where no other well-founded + // base URL can be derived. If it is applied, the fallback base URL is + // considered to be a well-founded base. + // + // # Root and base + // + // The given root and base URL are used to transform the intrinsic base returned + // by [`ResolvedInputSource::to_url`]. If the intrinsic base is a subpath of the given + // root, then a new base is constructed by taking the intrinsic base and replacing + // the root dir with the given base URL. + // + // In this way, links from local files can be resolved *as if* they were hosted + // in a remote location at the base URL. Later, in [`parse_url_with_base_info`], + // remote links which are subpaths of the base URL will be reflected back to + // local files within the root dir. + // + // # Well-founded bases + // + // Formally, a *well-founded* base is one which is derived from an input + // source which is *not* a local file, or one derived from a local file + // source which is a descendant of the given root dir. 
+ // + // Informally, and importantly for using [`SourceBaseInfo`], a well-founded + // base is one where we can sensibly resolve root-relative links (i.e., + // relative links starting with `/`). + // + // # Errors + // + // This function fails with an [`Err`] if: + // - any of the provided arguments cannot be converted to a URL, or + // - [`SourceBaseInfo::new`] fails. +} + +/// Prepares the needed structures to resolve links within a particular input source, +/// while handling roots and bases. +/// +/// This should be called once for each [`ResolvedInputSource`] being processed. +/// The result of this function should be used with [`parse_url_with_base_info`] +/// to parse and resolve URLs. +/// +/// # Errors +/// +/// Returns an error if converting any of the given arguments to a URL fails +/// unexpectedly. +pub fn prepare_source_base_info( + source: &ResolvedInputSource, + root_and_base: Option<(&Path, Option<&Base>)>, + fallback_base: Option<&Base>, +) -> Result<(SourceBaseInfo, UrlMappings), ErrorKind> { + let root_and_base: Option<(Url, Url)> = match root_and_base { + // if root is specified but not base, use root dir as the base as well. + Some((root, base_option)) => { + let root = Base::Local(root.to_owned()).to_url()?; + let base = base_option.map_or_else(|| Ok(root.clone()), Base::to_url)?; + Some((root, base)) + } + None => None, + }; + + let fallback_base = match fallback_base.map(Base::to_url).transpose()? { + None => SourceBaseInfo::no_info(), + Some(fallback_url) => SourceBaseInfo::from_source_url(&fallback_url), + }; + + let mappings = UrlMappings::new(root_and_base.into_iter().collect())?; + + let base_info = match source.to_url()? 
{ + Some(source_url) => match mappings.map_to_old_url(&source_url) { + Some((remote, subpath)) => SourceBaseInfo::full_info(remote.clone(), subpath), + None => SourceBaseInfo::from_source_url(&source_url), + }, + None => SourceBaseInfo::no_info(), + }; + + // NOTE: using fallback base in this way lets it override non-rooted + // file:// bases. + let base_info = base_info.or_fallback(fallback_base); + + Ok((base_info, mappings)) +} + +/// Parses and resolves the given URL text using the given base and mapping +/// information. +/// +/// # Errors +/// +/// Returns an error if the given text cannot be parsed as a URL, or if the +/// text parses as a relative URL and it cannot be resolved. +pub fn parse_url_with_base_info( + base_info: &SourceBaseInfo, + mappings: &UrlMappings, + text: &str, +) -> Result { + let url = base_info.parse_url_text(text, None)?; + + let mut url = match mappings.map_to_new_url(&url) { + Some((local, subpath)) => local.join(&subpath).ok(), + None => None, + } + .unwrap_or(url); + + // BACKWARDS COMPAT: delete trailing slash for file urls + if url.scheme() == "file" { + let _ = url + .path_segments_mut() + .as_mut() + .map(PathSegmentsMut::pop_if_empty); + } + + Ok(Uri { url }) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::num::NonZeroUsize; + use std::path::PathBuf; + + use crate::types::uri::raw::RawUriSpan; + + fn raw_uri(text: &'static str) -> RawUri { + RawUri { + text: text.to_string(), + element: None, + attribute: None, + span: RawUriSpan { + line: NonZeroUsize::MAX, + column: None, + }, + } + } + + // #[test] + // fn test_base_with_filename() { + // let root_dir = PathBuf::from("/some"); + // let base = Base::try_from("https://example.com/path/page2.html").unwrap(); + // let source = ResolvedInputSource::FsPath(PathBuf::from("/some/page.html")); + // let base_info = + // SourceBaseInfo::from_source(&source, Some((&root_dir, Some(&base))), None).unwrap(); + // + // assert_eq!( + // base_info + // 
.parse_uri(&raw_uri("#fragment")) + // .as_ref() + // .map(|x| x.url.as_str()), + // Ok("file:///some/page.html#fragment") + // ); + // } + // + // #[test] + // fn test_base_with_same_filename() { + // let root_dir = PathBuf::from("/some/pagex.html"); + // let base = Base::try_from("https://example.com/path/page.html").unwrap(); + // let source = ResolvedInputSource::FsPath(PathBuf::from("/some/pagex.html")); + // let base_info = + // SourceBaseInfo::from_source(&source, Some((&root_dir, Some(&base))), None).unwrap(); + // + // assert_eq!( + // base_info + // .parse_uri(&raw_uri("#fragment")) + // .as_ref() + // .map(|x| x.url.as_str()), + // Ok("file:///some/pagex.html#fragment") + // ); + // } +} diff --git a/lychee-lib/src/types/input/source.rs b/lychee-lib/src/types/input/source.rs index 74dede26b5..dbf5342638 100644 --- a/lychee-lib/src/types/input/source.rs +++ b/lychee-lib/src/types/input/source.rs @@ -21,6 +21,7 @@ use reqwest::Url; use serde::{Deserialize, Deserializer, Serialize}; use std::borrow::Cow; use std::fmt::Display; +use std::ops::Deref; use std::path::PathBuf; use std::result::Result; @@ -158,6 +159,26 @@ pub enum ResolvedInputSource { String(Cow<'static, str>), } +impl ResolvedInputSource { + /// Converts an [`InputSource::RemoteUrl`] or [`InputSource::FsPath`] + /// to a [`Url`] pointing to the source. + /// + /// The outer result indicates whether the operation succeeded. + /// For `InputSource` variants which are not `RemoteUrl` or `FsPath`, + /// the operation will "succeed" with `None`. 
+ pub fn to_url(&self) -> Result, ErrorKind> { + match self { + Self::RemoteUrl(url) => Ok(Some(url.deref().clone())), + Self::FsPath(path) => std::path::absolute(path) + .ok() + .and_then(|x| Url::from_file_path(x).ok()) + .ok_or_else(|| ErrorKind::InvalidUrlFromPath(path.to_owned())) + .map(Some), + _ => Ok(None), + } + } +} + impl From for InputSource { fn from(resolved: ResolvedInputSource) -> Self { match resolved { diff --git a/lychee-lib/src/types/mod.rs b/lychee-lib/src/types/mod.rs index c5cb9e5723..261bcbc9e3 100644 --- a/lychee-lib/src/types/mod.rs +++ b/lychee-lib/src/types/mod.rs @@ -2,6 +2,7 @@ mod accept; mod base; +pub(crate) mod base_info; mod basic_auth; mod cache; mod cookies; @@ -18,9 +19,11 @@ mod response; mod status; mod status_code_selector; pub(crate) mod uri; +pub(crate) mod url_mapping; pub use accept::*; pub use base::Base; +pub use base_info::SourceBaseInfo; pub use basic_auth::{BasicAuthCredentials, BasicAuthSelector}; pub use cache::CacheStatus; pub use cookies::CookieJar; @@ -34,6 +37,7 @@ pub use request_error::RequestError; pub use response::{Response, ResponseBody}; pub use status::Status; pub use status_code_selector::*; +pub use url_mapping::UrlMappings; /// The lychee `Result` type pub type Result = std::result::Result; diff --git a/lychee-lib/src/types/url_mapping.rs b/lychee-lib/src/types/url_mapping.rs new file mode 100644 index 0000000000..453cfa54fb --- /dev/null +++ b/lychee-lib/src/types/url_mapping.rs @@ -0,0 +1,74 @@ +//! Mapping of URLs based on prefix matches of the URL's path structure. +use crate::ErrorKind; +use crate::utils::url::ReqwestUrlExt; +use reqwest::Url; + +/// A collection of URL mappings which can be applied in either direction. +/// +/// Mappings are from URL to URL. A URL matches with a particular mapping +/// (and hence, the mapping will be applied) when the URL is a subpath +/// of the mapping source URL. Equivalently, this is when the URL has +/// a mapping's source URL as a prefix. 
+/// +/// Mappings are provided as pairs and the mapping can be interpreted in +/// either direction; the left URL can be mapped to the right, or +/// vice-versa. +/// +/// Despite this, we call the left side the "old URL" and the right side the +/// "new URL", since most uses will have _some_ level of directionality. +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct UrlMappings { + /// List of tuples of `old_url`, `new_url`. + mappings: Vec<(Url, Url)>, +} + +impl UrlMappings { + /// Constructs a new [`UrlMappings`] from the given mappings. + /// + /// # Errors + /// + /// If any pair has a URL which is a subpath of its other URL. + pub fn new(mappings: Vec<(Url, Url)>) -> Result { + // TODO: check no repeated bases/roots on the same side. + let conflicting_mapping = mappings.iter().find(|(remote, local)| { + if remote == local { + false + } else { + remote.strictly_relative_to(local).is_some() + || local.strictly_relative_to(remote).is_some() + } + }); + + match conflicting_mapping { + Some((base, root)) => Err(ErrorKind::InvalidBase( + base.to_string(), + format!("base cannot be parent or child of root-dir {root}"), + )), + None => Ok(Self { mappings }), + } + } + + /// Matches the given URL against the old (left) URLs and + /// returns the new (right) URL of the first matched pair, if any. + /// + /// If matched, the returned option will contain a URL from the new + /// side of a mapping, along with the subpath of the given URL when + /// the corresponding old URL is removed from it. + pub fn map_to_new_url(&self, url: &Url) -> Option<(&Url, String)> { + // TODO: choose longest match if multiple could apply?? + self.mappings.iter().find_map(|(left, right)| { + url.strictly_relative_to(right) + .map(|subpath| (left, subpath)) + }) + } + + /// Like [`UrlMappings::map_to_new_url`] but in the reverse direction, + /// matching against the new URLs and returning the corresponding + /// old URL of the matched mapping, if any.
+ pub fn map_to_old_url(&self, url: &Url) -> Option<(&Url, String)> { + self.mappings.iter().find_map(|(left, right)| { + url.strictly_relative_to(left) + .map(|subpath| (right, subpath)) + }) + } +} diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 8d64b2fd67..919054b854 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -1,14 +1,19 @@ +use log::warn; use percent_encoding::percent_decode_str; use reqwest::Url; +use std::borrow::Cow; use std::collections::HashSet; use std::path::{Path, PathBuf}; +use crate::types::SourceBaseInfo; +use crate::types::base_info; use crate::{ Base, BasicAuthCredentials, ErrorKind, LycheeResult, Request, RequestError, Uri, basic_auth::BasicAuthExtractor, types::{ResolvedInputSource, uri::raw::RawUri}, - utils::{path, url}, + utils::{path, url, url::ReqwestUrlExt}, }; +use ::url::ParseError; /// Extract basic auth credentials for a given URL. pub(crate) fn extract_credentials( @@ -22,11 +27,13 @@ pub(crate) fn extract_credentials( fn create_request( raw_uri: &RawUri, source: &ResolvedInputSource, - root_dir: Option<&PathBuf>, - base: Option<&Base>, + base_info: &SourceBaseInfo, extractor: Option<&BasicAuthExtractor>, ) -> LycheeResult { - let uri = try_parse_into_uri(raw_uri, source, root_dir, base)?; + // WARN: BROKEN because this needs to do all mapping. + let uri = Uri { + url: base_info.parse_url_text(&raw_uri.text, None)?, + }; let source = source.clone(); let element = raw_uri.element.clone(); let attribute = raw_uri.attribute.clone(); @@ -35,40 +42,23 @@ fn create_request( Ok(Request::new(uri, source, element, attribute, credentials)) } -/// Try to parse the raw URI into a `Uri`. -/// -/// If the raw URI is not a valid URI, create a URI by joining the base URL with the text. -/// If the base URL is not available, create a URI from the file path. 
-/// -/// # Errors -/// -/// - If the text (the unparsed URI represented as a `String`) cannot be joined with the base -/// to create a valid URI. -/// - If a URI cannot be created from the file path. -/// - If the source is not a file path (i.e. the URI type is not supported). +/// Shim to [`SourceBaseInfo`] for testing. This function is no longer +/// used by the main execution. fn try_parse_into_uri( raw_uri: &RawUri, source: &ResolvedInputSource, - root_dir: Option<&PathBuf>, + root_dir: Option<&Path>, base: Option<&Base>, ) -> LycheeResult { - let text = prepend_root_dir_if_absolute_local_link(&raw_uri.text, root_dir); - let uri = match Uri::try_from(raw_uri.clone()) { - Ok(uri) => uri, - Err(_) => match base { - Some(base_url) => match base_url.join(&text) { - Some(url) => Uri { url }, - None => return Err(ErrorKind::InvalidBaseJoin(text.clone())), - }, - None => match source { - ResolvedInputSource::FsPath(root) => { - create_uri_from_file_path(root, &text, root_dir.is_none())? - } - _ => return Err(ErrorKind::UnsupportedUriType(text)), - }, - }, - }; - Ok(uri) + // HACK: if only base_url is specified, use that as a fallback_base_url. 
+ let (a, b) = match (root_dir, base) { + (None, base) => base_info::prepare_source_base_info(source, None, base), + (Some(root_dir), base) => { + base_info::prepare_source_base_info(source, Some((root_dir, base)), None) + } + }?; + + base_info::parse_url_with_base_info(&a, &b, &raw_uri.text) } // Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs @@ -118,21 +108,36 @@ fn create_uri_from_file_path( pub(crate) fn create( uris: Vec, source: &ResolvedInputSource, - root_dir: Option<&PathBuf>, - base: Option<&Base>, + root_and_base: Option<(&Path, Option<&Base>)>, + fallback_base: Option<&Base>, extractor: Option<&BasicAuthExtractor>, ) -> Vec> { - let base = base.cloned().or_else(|| Base::from_source(source)); + // TODO: it would probably be nice to inline prepare_source_base_info into this function. + // however, it uses a lot of `.?` and we need to catch and handle all those errors here. + let (base_info, mappings) = + match base_info::prepare_source_base_info(source, root_and_base, fallback_base) { + Ok(base_info) => base_info, + Err(e) => { + // TODO: IMPORTANT! return an error inside this vec. 
+ warn!("Error handling source {source}: {e:?}"); + return vec![]; + } + }; let mut requests = HashSet::::new(); let mut errors = Vec::::new(); for raw_uri in uris { - let result = create_request(&raw_uri, source, root_dir, base.as_ref(), extractor); - match result { - Ok(request) => { - requests.insert(request); + match base_info::parse_url_with_base_info(&base_info, &mappings, &raw_uri.text) { + Ok(uri) => { + let source = source.clone(); + let element = raw_uri.element.clone(); + let attribute = raw_uri.attribute.clone(); + let credentials = extract_credentials(extractor, &uri); + + requests.insert(Request::new(uri, source, element, attribute, credentials)); } + Err(e) => errors.push(RequestError::CreateRequestItem( raw_uri.clone(), source.clone(), @@ -184,7 +189,7 @@ fn resolve_and_create_url( Ok(url) } -fn prepend_root_dir_if_absolute_local_link(text: &str, root_dir: Option<&PathBuf>) -> String { +fn prepend_root_dir_if_absolute_local_link(text: &str, root_dir: Option<&Path>) -> String { if text.starts_with('/') && let Some(path) = root_dir && let Some(path_str) = path.to_str() @@ -212,11 +217,11 @@ mod tests { fn create_ok_only( uris: Vec, source: &ResolvedInputSource, - root_dir: Option<&PathBuf>, - base: Option<&Base>, + root_and_base: Option<(&Path, Option<&Base>)>, + fallback_base: Option<&Base>, extractor: Option<&BasicAuthExtractor>, ) -> Vec { - create(uris, source, root_dir, base, extractor) + create(uris, source, root_and_base, fallback_base, extractor) .into_iter() .filter_map(Result::ok) .collect() @@ -333,7 +338,7 @@ mod tests { let source = ResolvedInputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![raw_uri("relative.html")]; - let requests = create_ok_only(uris, &source, Some(&root_dir), None, None); + let requests = create_ok_only(uris, &source, Some((&root_dir, None)), None, None); assert_eq!(requests.len(), 1); assert!( @@ -349,7 +354,7 @@ mod tests { let source = 
ResolvedInputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![raw_uri("https://another.com/page")]; - let requests = create_ok_only(uris, &source, Some(&root_dir), None, None); + let requests = create_ok_only(uris, &source, Some((&root_dir, None)), None, None); assert_eq!(requests.len(), 1); assert!( @@ -362,10 +367,10 @@ mod tests { #[test] fn test_root_relative_url_resolution_from_root_dir() { let root_dir = PathBuf::from("/tmp/lychee"); - let source = ResolvedInputSource::FsPath(PathBuf::from("/some/page.html")); + let source = ResolvedInputSource::FsPath(PathBuf::from("/tmp/lychee/page.html")); let uris = vec![raw_uri("/root-relative")]; - let requests = create_ok_only(uris, &source, Some(&root_dir), None, None); + let requests = create_ok_only(uris, &source, Some((&root_dir, None)), None, None); assert_eq!(requests.len(), 1); assert!( @@ -381,7 +386,7 @@ mod tests { let source = ResolvedInputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![raw_uri("../parent")]; - let requests = create_ok_only(uris, &source, Some(&root_dir), None, None); + let requests = create_ok_only(uris, &source, Some((&root_dir, None)), None, None); assert_eq!(requests.len(), 1); assert!( @@ -397,7 +402,7 @@ mod tests { let source = ResolvedInputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![raw_uri("#fragment")]; - let requests = create_ok_only(uris, &source, Some(&root_dir), None, None); + let requests = create_ok_only(uris, &source, Some((&root_dir, None)), None, None); assert_eq!(requests.len(), 1); assert!( @@ -411,16 +416,16 @@ mod tests { fn test_relative_url_resolution_from_root_dir_and_base_url() { let root_dir = PathBuf::from("/tmp/lychee"); let base = Base::try_from("https://example.com/path/page.html").unwrap(); - let source = ResolvedInputSource::FsPath(PathBuf::from("/some/page.html")); + let source = ResolvedInputSource::FsPath(PathBuf::from("/tmp/lychee/localpage.html")); let uris = vec![raw_uri("relative.html")]; - let 
requests = create_ok_only(uris, &source, Some(&root_dir), Some(&base), None); + let requests = create_ok_only(uris, &source, Some((&root_dir, Some(&base))), None, None); assert_eq!(requests.len(), 1); assert!( requests .iter() - .any(|r| r.uri.url.as_str() == "https://example.com/path/relative.html") + .any(|r| r.uri.url.as_str() == "file:///tmp/lychee/relative.html") ); } @@ -431,8 +436,9 @@ mod tests { let source = ResolvedInputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![raw_uri("https://another.com/page")]; - let requests = create_ok_only(uris, &source, Some(&root_dir), Some(&base), None); + let requests = create_ok_only(uris, &source, Some((&root_dir, Some(&base))), None, None); + println!("{:?}", requests); assert_eq!(requests.len(), 1); assert!( requests @@ -445,16 +451,17 @@ mod tests { fn test_root_relative_url_resolution_from_root_dir_and_base_url() { let root_dir = PathBuf::from("/tmp/lychee"); let base = Base::try_from("https://example.com/path/page.html").unwrap(); - let source = ResolvedInputSource::FsPath(PathBuf::from("/some/page.html")); + let source = ResolvedInputSource::FsPath(PathBuf::from("/tmp/lychee/localpage.html")); let uris = vec![raw_uri("/root-relative")]; - let requests = create_ok_only(uris, &source, Some(&root_dir), Some(&base), None); + let requests = create_ok_only(uris, &source, Some((&root_dir, Some(&base))), None, None); + println!("{:?}", requests); assert_eq!(requests.len(), 1); assert!( requests .iter() - .any(|r| r.uri.url.as_str() == "https://example.com/tmp/lychee/root-relative") + .any(|r| r.uri.url.as_str() == "https://example.com/root-relative") ); } @@ -465,30 +472,30 @@ mod tests { let source = ResolvedInputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![raw_uri("../parent")]; - let requests = create_ok_only(uris, &source, Some(&root_dir), Some(&base), None); + let requests = create_ok_only(uris, &source, Some((&root_dir, Some(&base))), None, None); assert_eq!(requests.len(), 1); 
assert!( requests .iter() - .any(|r| r.uri.url.as_str() == "https://example.com/parent") + .any(|r| r.uri.url.as_str() == "file:///parent") ); } #[test] fn test_fragment_url_resolution_from_root_dir_and_base_url() { - let root_dir = PathBuf::from("/tmp/lychee"); - let base = Base::try_from("https://example.com/path/page.html").unwrap(); + let root_dir = PathBuf::from("/some"); + let base = Base::try_from("https://example.com/path/").unwrap(); let source = ResolvedInputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![raw_uri("#fragment")]; - let requests = create_ok_only(uris, &source, Some(&root_dir), Some(&base), None); + let requests = create_ok_only(uris, &source, Some((&root_dir, Some(&base))), None, None); assert_eq!(requests.len(), 1); assert!( requests .iter() - .any(|r| r.uri.url.as_str() == "https://example.com/path/page.html#fragment") + .any(|r| r.uri.url.as_str() == "file:///some/page.html#fragment") ); } @@ -511,15 +518,10 @@ mod tests { fn test_create_request_from_relative_file_path() { let base = Base::Local(PathBuf::from("/tmp/lychee")); let input_source = ResolvedInputSource::FsPath(PathBuf::from("page.html")); + let base_info = SourceBaseInfo::from_source(&input_source, None, Some(&base)).unwrap(); - let actual = create_request( - &raw_uri("file.html"), - &input_source, - None, - Some(&base), - None, - ) - .unwrap(); + let actual = + create_request(&raw_uri("file.html"), &input_source, &base_info, None).unwrap(); assert_eq!( actual, @@ -542,20 +544,19 @@ mod tests { create_request( &raw_uri("file.html"), &ResolvedInputSource::Stdin, - None, - None, + &SourceBaseInfo::from_source(&ResolvedInputSource::Stdin, None, None).unwrap(), None, ) .is_err() ); // error because no root-dir and no base-url + let src = ResolvedInputSource::FsPath(PathBuf::from("page.html")); assert!( create_request( &raw_uri("/file.html"), - &ResolvedInputSource::FsPath(PathBuf::from("page.html")), - None, - None, + &src, + &SourceBaseInfo::from_source(&src, 
None, None).unwrap(), None, ) .is_err() @@ -564,15 +565,15 @@ mod tests { #[test] fn test_create_request_from_absolute_file_path() { - let base = Base::Local(PathBuf::from("/tmp/lychee")); + let base = Base::Local(PathBuf::from("/")); let input_source = ResolvedInputSource::FsPath(PathBuf::from("/tmp/lychee/page.html")); + let base_info = SourceBaseInfo::from_source(&input_source, None, Some(&base)).unwrap(); // Use an absolute path that's outside the base directory let actual = create_request( &raw_uri("/usr/local/share/doc/example.html"), &input_source, - None, - Some(&base), + &base_info, None, ) .unwrap(); diff --git a/lychee-lib/src/utils/url.rs b/lychee-lib/src/utils/url.rs index dc850c6fb8..35a7cbf8e3 100644 --- a/lychee-lib/src/utils/url.rs +++ b/lychee-lib/src/utils/url.rs @@ -1,6 +1,9 @@ +use std::borrow::Cow; use std::sync::LazyLock; use linkify::LinkFinder; +use reqwest::Url; +use url::ParseError; static LINK_FINDER: LazyLock = LazyLock::new(LinkFinder::new); @@ -23,6 +26,262 @@ pub(crate) fn find_links(input: &str) -> impl Iterator> LINK_FINDER.links(input) } +pub(crate) trait ReqwestUrlExt { + fn strictly_relative_to(&self, prefix: &Url) -> Option; + fn join_rooted(&self, subpaths: &[&str]) -> Result; +} + +impl ReqwestUrlExt for Url { + fn strictly_relative_to(&self, prefix: &Url) -> Option { + if self.scheme() != prefix.scheme() + || self.authority() != prefix.authority() + || self.port() != prefix.port() + { + return None; + } + + let prefix_has_filename = prefix.path_segments()?.last().is_some_and(|x| x != ""); + + let relative = if prefix_has_filename { + if self.path() == prefix.path() { + Some(String::new()) + } else { + None + } + } else { + let mut prefix_segments = prefix.path_segments()?.peekable(); + let mut url_segments = self.path_segments()?.peekable(); + + // discard "" entry from the end of the prefix + let _ = prefix_segments.next_back(); + + while let Some(s1) = prefix_segments.peek() + && let Some(s2) = url_segments.peek() + && s1 
== s2 + { + let _ = prefix_segments.next(); + let _ = url_segments.next(); + } + + let remaining_prefix = prefix_segments.collect::>(); + let remaining_url = url_segments.collect::>(); + + println!("{:?}", remaining_prefix); + println!("{:?}", remaining_url); + + let relative = match (&remaining_prefix[..], &remaining_url[..]) { + // if nothing is remaining in URL, then we have prefix=/a/, url=/a. + // this should NOT be considered a match. + ([], []) => None, + + ([], rest) => Some(rest.join("/")), + + _ => None, + }; + + relative.map(|x| { + if x.starts_with("/") { + format!(".{x}") + } else { + x + } + }) + }; + + println!("x={:?}", relative); + + relative.map(|mut relative| { + if let Some(query) = self.query() { + relative.push('?'); + relative.push_str(query); + } + + if let Some(fragment) = self.fragment() { + relative.push('#'); + relative.push_str(fragment); + } + relative + }) + + // prefix + // .make_relative(self) + // .filter(|subpath| !subpath.starts_with("../") && !subpath.starts_with('/')) + // .inspect(|x| println!("subpathing {}", x)) + // .filter(|_| prefix.as_str().starts_with(self.as_str())) + } + + fn join_rooted(&self, subpaths: &[&str]) -> Result { + let base = self; + // println!("applying {}, {}, {}", base, subpath, link); + // tests: + // - .. out of local base should be blocked. + // - scheme-relative urls should work and not spuriously trigger base url + // - fully-qualified urls should work + // - slash should work to go to local base, if specified + // - slash should be forbidden for inferred base urls. 
+ // - percent encoding ;-; + // - trailing slashes in base-url and/or root-dir + // - fragments and query params, on both http and file + // - windows file paths ;-; + let fake_base = match base.scheme() { + "file" => { + let mut fake_base = base.join("/")?; + fake_base.set_host(Some("secret-lychee-base-url.invalid"))?; + Some(fake_base) + } + _ => None, + }; + + let mut url = Cow::Borrowed(fake_base.as_ref().unwrap_or(base)); + for subpath in subpaths { + url = Cow::Owned(url.join(subpath)?); + } + + match fake_base.as_ref().and_then(|b| b.make_relative(&url)) { + Some(relative_to_base) => base.join(&relative_to_base), + None => Ok(url.into_owned()), + } + // .inspect(|x| println!("---> {x}")) + } +} + +#[cfg(test)] +mod test_url_ext { + use super::*; + + macro_rules! url { + ($x: expr) => { + Url::parse($x).unwrap() + }; + } + + #[test] + fn test_strictly_relative_to() { + // note trailing slashes for subpaths, otherwise everything becomes siblings + let goog = Url::parse("https://goog.com").unwrap(); + let goog_subpath = goog.join("subpath/").unwrap(); + let goog_subsubpath = goog_subpath.join("sub2path/").unwrap(); + + assert_eq!(goog.strictly_relative_to(&goog).as_deref(), Some("")); + + assert_eq!( + goog_subpath.strictly_relative_to(&goog).as_deref(), + Some("subpath/") + ); + assert_eq!(goog.strictly_relative_to(&goog_subpath).as_deref(), None); + + assert_eq!( + goog_subpath + .strictly_relative_to(&goog_subsubpath) + .as_deref(), + None + ); + } + + #[test] + fn test_fdsa() { + // exact match + assert_eq!( + url!("https://a.com/b/x") + .strictly_relative_to(&url!("https://a.com/b/x")) + .as_deref(), + Some("") + ); + assert_eq!( + url!("https://a.com/b/") + .strictly_relative_to(&url!("https://a.com/b/")) + .as_deref(), + Some("") + ); + assert_eq!( + url!("https://a.com/b/x?a=2") + .strictly_relative_to(&url!("https://a.com/b/x?b=x")) + .as_deref(), + Some("?a=2") + ); + + // no matches due to / difference + assert_eq!( + url!("https://a.com/b") + 
.strictly_relative_to(&url!("https://a.com/b/")) + .as_deref(), + None + ); + assert_eq!( + url!("https://a.com/b/") + .strictly_relative_to(&url!("https://a.com/b")) + .as_deref(), + None + ); + + // changing filename leads to no match + assert_eq!( + url!("https://a.com/b/x") + .strictly_relative_to(&url!("https://a.com/b/aa")) + .as_deref(), + None + ); + + // matching in subdir + assert_eq!( + url!("https://a.com/b/x") + .strictly_relative_to(&url!("https://a.com/b/")) + .as_deref(), + Some("x") + ); + + // no match + assert_eq!( + url!("https://a.com/b/x") + .strictly_relative_to(&url!("https://a.com/b")) + .as_deref(), + None + ); + assert_eq!( + url!("https://a.com/b/x") + .strictly_relative_to(&url!("https://a.com/a")) + .as_deref(), + None + ); + assert_eq!( + url!("https://a.com/b/x") + .strictly_relative_to(&url!("https://a.com/a/")) + .as_deref(), + None + ); + + // matches and maintains extra ./ inside url. + assert_eq!( + url!("https://a.com/b//x") + .strictly_relative_to(&url!("https://a.com/b/")) + .as_deref(), + Some("./x") + ); + assert_eq!( + url!("https://a.com/b///x") + .strictly_relative_to(&url!("https://a.com/b/")) + .as_deref(), + Some(".//x") + ); + + println!( + "{:?}", + url!("https://a.com/b//x") + .path_segments() + .unwrap() + .collect::>() + ); + println!( + "{:?}", + url!("https://a.com/b/") + .path_segments() + .unwrap() + .collect::>() + ); + panic!(); + } +} + #[cfg(test)] mod test_fs_tree { use super::*; diff --git a/lychee.example.toml b/lychee.example.toml index 967031ae92..916c115a90 100644 --- a/lychee.example.toml +++ b/lychee.example.toml @@ -117,6 +117,9 @@ base_url = "https://example.com" # Root path to use when checking absolute local links, must be an absolute path root_dir = "/dist" +# Fallback base URL to use for input sources with no base URL +fallback_base_url = "https://example.com/fallback.html" + # HTTP basic auth support. This will be the username and password passed to the # authorization HTTP header. See #