From e4239a789d30d747ba1530a8107d9a587ba95667 Mon Sep 17 00:00:00 2001 From: rina Date: Sun, 31 Aug 2025 19:56:37 +1000 Subject: [PATCH 01/59] ihii --- lychee-lib/src/types/base.rs | 8 ++++ lychee-lib/src/utils/request.rs | 80 ++++++++++++++++++++++++--------- 2 files changed, 67 insertions(+), 21 deletions(-) diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs index 4c68900c18..41d31aed7a 100644 --- a/lychee-lib/src/types/base.rs +++ b/lychee-lib/src/types/base.rs @@ -30,6 +30,13 @@ impl Base { } } + pub(crate) fn to_url(&self) -> Option { + match self { + Self::Remote(url) => Some(url.clone()), + Self::Local(path) => Url::from_file_path(path).ok(), + } + } + pub(crate) fn from_source(source: &InputSource) -> Option { match &source { InputSource::RemoteUrl(url) => { @@ -42,6 +49,7 @@ impl Base { // We keep the username and password intact Some(Base::Remote(*base_url)) } + InputSource::FsPath(path) => path.to_path_buf().canonicalize().ok().map(Base::Local), // other inputs do not have a URL to extract a base _ => None, } diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 6b57b9ee8c..a342a4aaa6 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -1,6 +1,7 @@ use log::warn; use percent_encoding::percent_decode_str; use reqwest::Url; +use std::borrow::Cow; use std::{ collections::HashSet, path::{Path, PathBuf}, @@ -56,22 +57,61 @@ fn try_parse_into_uri( base: Option<&Base>, ) -> Result { let text = prepend_root_dir_if_absolute_local_link(&raw_uri.text, root_dir); - let uri = match Uri::try_from(raw_uri.clone()) { - Ok(uri) => uri, - Err(_) => match base { - Some(base_url) => match base_url.join(&text) { - Some(url) => Uri { url }, - None => return Err(ErrorKind::InvalidBaseJoin(text.clone())), - }, - None => match source { - InputSource::FsPath(root) => { - create_uri_from_file_path(root, &text, root_dir.is_none())? 
+ + let base = base + .map(Cow::Borrowed) + .or_else(|| root_dir.map(|root| Cow::Owned(Base::Local(root.clone())))); + + // 1. graft input source - if source is a FsPath subdirectory of root_dir, + // replace InputSource with RemoteUrl. + + // 2. map root_dir to base. + + match (base, root_dir) { + (Some(remote_base), Some(root_dir)) => { + let source_base = match Base::from_source(source) { + Some(Base::Local(local_base)) => { + println!("{local_base:?}"); + match local_base.strip_prefix(root_dir) { + Ok(subpath) => { + let subpath = subpath.to_string_lossy(); + Base::Remote(remote_base.to_url().unwrap().join(&subpath).unwrap()) + } + Err(_) => Base::Local(local_base), + } + .into() } - _ => return Err(ErrorKind::UnsupportedUriType(text)), - }, - }, - }; - Ok(uri) + x => x, + }; + + let base2 = source_base.as_ref().and_then(Base::to_url); + + let ads = reqwest::Url::options() + .base_url(base2.as_ref()) + .parse(&raw_uri.text) + .map_err(|e| ErrorKind::ParseUrl(e, raw_uri.text.clone())); + let x = ads?; + println!("{:?}", x.as_str()); + + // TODO: MAP BACK TO local root dir by checking if ads starts with base. 
+ + // let uri = match Uri::try_from(raw_uri.clone()) { + // Ok(uri) => uri, + // Err(_) => match base { + // Some(base_url) => match base_url.join(&text) { + // Some(url) => Uri { url }, + // None => return Err(ErrorKind::InvalidBaseJoin(text.clone())), + // }, + // None => panic!("no base :((((("), + // }, + // }; + // println!(" = {uri:?}"); + Ok(Uri { url: x }) + } + _ => panic!("fdjsiao"), + } + + // let base = base.and_then(Base::to_url); } // Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs @@ -140,18 +180,16 @@ pub(crate) fn create( base: Option<&Base>, extractor: Option<&BasicAuthExtractor>, ) -> HashSet { - let base = base.cloned().or_else(|| Base::from_source(source)); - uris.into_iter() - .filter_map(|raw_uri| { - match create_request(&raw_uri, source, root_dir, base.as_ref(), extractor) { + .filter_map( + |raw_uri| match create_request(&raw_uri, source, root_dir, base, extractor) { Ok(request) => Some(request), Err(e) => { warn!("Error creating request: {e:?}"); None } - } - }) + }, + ) .collect() } From 3ed59a04a02919714693a3e0f85af2290841890f Mon Sep 17 00:00:00 2001 From: rina Date: Sun, 31 Aug 2025 22:04:17 +1000 Subject: [PATCH 02/59] as foreseen, we have big problems with absolute-rooted file urls --- lychee-lib/src/collector.rs | 2 +- lychee-lib/src/types/input/source.rs | 15 +++++++ lychee-lib/src/utils/request.rs | 60 ++++++++++++++++++---------- 3 files changed, 55 insertions(+), 22 deletions(-) diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 791a548761..2b3c35ae0b 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -292,7 +292,7 @@ impl Collector { let requests = request::create( uris, &content.source, - root_dir.as_ref(), + root_dir.as_deref(), base.as_ref(), basic_auth_extractor.as_ref(), ); diff --git a/lychee-lib/src/types/input/source.rs b/lychee-lib/src/types/input/source.rs index 6abcea098b..164feb17c9 100644 --- 
a/lychee-lib/src/types/input/source.rs +++ b/lychee-lib/src/types/input/source.rs @@ -40,6 +40,21 @@ pub enum InputSource { String(String), } +impl InputSource { + /// Converts a [`Self::RemoteUrl`] or [`Self::FsPath`] to a + /// [`Url`], if possible. + /// + /// Returns `None` if the [`InputSource`] is not these cases, + /// of if the `FsPath` is not a valid URL. + pub fn to_url(&self) -> Option { + match self { + Self::RemoteUrl(url) => Some(*url.clone()), + Self::FsPath(path) => Url::from_file_path(path.canonicalize().ok()?).ok(), + _ => None, + } + } +} + /// Resolved input sources that can be processed for content. /// /// This represents input sources after glob pattern expansion. diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index a342a4aaa6..0495685279 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -2,6 +2,7 @@ use log::warn; use percent_encoding::percent_decode_str; use reqwest::Url; use std::borrow::Cow; +use std::ops::Deref; use std::{ collections::HashSet, path::{Path, PathBuf}, @@ -26,7 +27,7 @@ pub(crate) fn extract_credentials( fn create_request( raw_uri: &RawUri, source: &InputSource, - root_dir: Option<&PathBuf>, + root_dir: Option<&Path>, base: Option<&Base>, extractor: Option<&BasicAuthExtractor>, ) -> Result { @@ -53,41 +54,58 @@ fn create_request( fn try_parse_into_uri( raw_uri: &RawUri, source: &InputSource, - root_dir: Option<&PathBuf>, + root_dir: Option<&Path>, base: Option<&Base>, ) -> Result { let text = prepend_root_dir_if_absolute_local_link(&raw_uri.text, root_dir); let base = base - .map(Cow::Borrowed) - .or_else(|| root_dir.map(|root| Cow::Owned(Base::Local(root.clone())))); + .and_then(Base::to_url) + .or_else(|| root_dir.and_then(|root| Url::from_file_path(root).ok())); + + println!("{:?}", base.clone().unwrap().join("not rooted")); + println!("{:?}", base.clone().unwrap().join("/rooted")); // 1. 
graft input source - if source is a FsPath subdirectory of root_dir, // replace InputSource with RemoteUrl. // 2. map root_dir to base. + // let root_dir_url = root_dir + // .map(Path::to_string_lossy) + // .map(|x| Url::from_file_path(&*x).expect("file path to url failed?!")); + // let root_dir = root_dir_url.as_ref().map(Url::as_str); + match (base, root_dir) { (Some(remote_base), Some(root_dir)) => { - let source_base = match Base::from_source(source) { - Some(Base::Local(local_base)) => { - println!("{local_base:?}"); - match local_base.strip_prefix(root_dir) { - Ok(subpath) => { - let subpath = subpath.to_string_lossy(); - Base::Remote(remote_base.to_url().unwrap().join(&subpath).unwrap()) - } - Err(_) => Base::Local(local_base), - } - .into() - } - x => x, + println!("{:?}", remote_base.join("/rooted file ")); + let source_base = match source { + InputSource::RemoteUrl(url) => Some(Cow::Borrowed(url.deref())), + InputSource::FsPath(path) => match path.canonicalize() { + Ok(path) => path + .strip_prefix(&*root_dir) + .ok() + .map(|subpath| { + // let subpath = subpath.strip_prefix(subpath.join("/")).unwrap_or(subpath); + println!("subpath = {:?}", subpath); + remote_base.join(&subpath.to_string_lossy()).expect("joining failed?!") + }) + .map(Cow::Owned) + .or_else(|| { + Some(Cow::Owned( + Url::from_file_path(path).expect("path to url failed?"), + )) + }), + Err(_) => None, + }, + _ => None, }; - let base2 = source_base.as_ref().and_then(Base::to_url); + let base2 = source_base; + println!("base = {:?}, uri = {:?}", base2.as_deref(), &raw_uri.text); let ads = reqwest::Url::options() - .base_url(base2.as_ref()) + .base_url(base2.as_deref()) .parse(&raw_uri.text) .map_err(|e| ErrorKind::ParseUrl(e, raw_uri.text.clone())); let x = ads?; @@ -176,7 +194,7 @@ fn truncate_source(source: &InputSource) -> InputSource { pub(crate) fn create( uris: Vec, source: &InputSource, - root_dir: Option<&PathBuf>, + root_dir: Option<&Path>, base: Option<&Base>, extractor: 
Option<&BasicAuthExtractor>, ) -> HashSet { @@ -231,7 +249,7 @@ fn resolve_and_create_url( Ok(url) } -fn prepend_root_dir_if_absolute_local_link(text: &str, root_dir: Option<&PathBuf>) -> String { +fn prepend_root_dir_if_absolute_local_link(text: &str, root_dir: Option<&Path>) -> String { if text.starts_with('/') { if let Some(path) = root_dir { if let Some(path_str) = path.to_str() { From 84974ce830312c5fbb27ab989d01df3fa7c42136 Mon Sep 17 00:00:00 2001 From: rina Date: Sun, 31 Aug 2025 22:13:06 +1000 Subject: [PATCH 03/59] blah --- lychee-lib/src/utils/request.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 0495685279..55a103d7ea 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -59,6 +59,7 @@ fn try_parse_into_uri( ) -> Result { let text = prepend_root_dir_if_absolute_local_link(&raw_uri.text, root_dir); + println!("{:?}", base.clone().unwrap().join("/rooted")); let base = base .and_then(Base::to_url) .or_else(|| root_dir.and_then(|root| Url::from_file_path(root).ok())); @@ -88,7 +89,9 @@ fn try_parse_into_uri( .map(|subpath| { // let subpath = subpath.strip_prefix(subpath.join("/")).unwrap_or(subpath); println!("subpath = {:?}", subpath); - remote_base.join(&subpath.to_string_lossy()).expect("joining failed?!") + remote_base + .join(&subpath.to_string_lossy()) + .expect("joining failed?!") }) .map(Cow::Owned) .or_else(|| { @@ -104,10 +107,15 @@ fn try_parse_into_uri( let base2 = source_base; println!("base = {:?}, uri = {:?}", base2.as_deref(), &raw_uri.text); - let ads = reqwest::Url::options() - .base_url(base2.as_deref()) - .parse(&raw_uri.text) - .map_err(|e| ErrorKind::ParseUrl(e, raw_uri.text.clone())); + let ads = match base2.as_deref() { + // Some(base) if base.scheme() == "file" => { + // + // } + _ => reqwest::Url::options() + .base_url(base2.as_deref()) + .parse(&raw_uri.text) + .map_err(|e| 
ErrorKind::ParseUrl(e, raw_uri.text.clone())), + }; let x = ads?; println!("{:?}", x.as_str()); From 2c10447de72af7563e8c0ab7fafffc29a6e1bde5 Mon Sep 17 00:00:00 2001 From: rina Date: Mon, 1 Sep 2025 00:10:38 +1000 Subject: [PATCH 04/59] secret-lychee-local-base-url --- lychee-lib/src/utils/request.rs | 47 +++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 55a103d7ea..c6cfb8de63 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -15,6 +15,8 @@ use crate::{ utils::{path, url}, }; +use ::url::ParseError; + /// Extract basic auth credentials for a given URL. pub(crate) fn extract_credentials( extractor: Option<&BasicAuthExtractor>, @@ -40,6 +42,23 @@ fn create_request( Ok(Request::new(uri, source, element, attribute, credentials)) } +fn apply_base(input: &str, base: Option<&Url>) -> std::result::Result { + let secret_local_base = + reqwest::Url::parse("ftp://secret-lychee-local-base-url.internal/").unwrap(); + + let fake_base = base.clone().map(|base| match base { + base if base.scheme() == "file" => &secret_local_base, + base => base, + }); + + let url = reqwest::Url::options().base_url(fake_base).parse(input)?; + + match secret_local_base.make_relative(&url) { + Some(subpath) => base.unwrap().join(&subpath), + None => Ok(url), + } +} + /// Try to parse the raw URI into a `Uri`. /// /// If the raw URI is not a valid URI, create a URI by joining the base URL with the text. 
@@ -59,13 +78,14 @@ fn try_parse_into_uri( ) -> Result { let text = prepend_root_dir_if_absolute_local_link(&raw_uri.text, root_dir); - println!("{:?}", base.clone().unwrap().join("/rooted")); + // println!("{:?}", base.clone().unwrap().join("/rooted")); let base = base .and_then(Base::to_url) .or_else(|| root_dir.and_then(|root| Url::from_file_path(root).ok())); + println!("{:?}", apply_base(&raw_uri.text, base.as_ref())); - println!("{:?}", base.clone().unwrap().join("not rooted")); - println!("{:?}", base.clone().unwrap().join("/rooted")); + // println!("{:?}", base.clone().unwrap().join("not rooted")); + // println!("{:?}", base.clone().unwrap().join("/rooted")); // 1. graft input source - if source is a FsPath subdirectory of root_dir, // replace InputSource with RemoteUrl. @@ -79,7 +99,7 @@ fn try_parse_into_uri( match (base, root_dir) { (Some(remote_base), Some(root_dir)) => { - println!("{:?}", remote_base.join("/rooted file ")); + // println!("{:?}", remote_base.join("/rooted file ")); let source_base = match source { InputSource::RemoteUrl(url) => Some(Cow::Borrowed(url.deref())), InputSource::FsPath(path) => match path.canonicalize() { @@ -91,7 +111,7 @@ fn try_parse_into_uri( println!("subpath = {:?}", subpath); remote_base .join(&subpath.to_string_lossy()) - .expect("joining failed?!") + .expect("joining onto base url failed?!") }) .map(Cow::Owned) .or_else(|| { @@ -105,19 +125,12 @@ fn try_parse_into_uri( }; let base2 = source_base; - println!("base = {:?}, uri = {:?}", base2.as_deref(), &raw_uri.text); - - let ads = match base2.as_deref() { - // Some(base) if base.scheme() == "file" => { - // - // } - _ => reqwest::Url::options() - .base_url(base2.as_deref()) - .parse(&raw_uri.text) - .map_err(|e| ErrorKind::ParseUrl(e, raw_uri.text.clone())), - }; + // println!("base = {:?}, uri = {:?}", base2.as_deref(), &raw_uri.text); + + let ads = apply_base(&raw_uri.text, base2.as_deref()) + .map_err(|e| ErrorKind::ParseUrl(e, raw_uri.text.clone())); let x = 
ads?; - println!("{:?}", x.as_str()); + // println!("{:?}", x.as_str()); // TODO: MAP BACK TO local root dir by checking if ads starts with base. From 542d9cdc99d5c8d23bbc666f278d1f5eed1c126e Mon Sep 17 00:00:00 2001 From: rina Date: Mon, 1 Sep 2025 00:20:45 +1000 Subject: [PATCH 05/59] lazy init --- lychee-lib/src/utils/request.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index c6cfb8de63..121cacc68d 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -3,6 +3,7 @@ use percent_encoding::percent_decode_str; use reqwest::Url; use std::borrow::Cow; use std::ops::Deref; +use std::sync::LazyLock; use std::{ collections::HashSet, path::{Path, PathBuf}, @@ -42,18 +43,18 @@ fn create_request( Ok(Request::new(uri, source, element, attribute, credentials)) } -fn apply_base(input: &str, base: Option<&Url>) -> std::result::Result { - let secret_local_base = - reqwest::Url::parse("ftp://secret-lychee-local-base-url.internal/").unwrap(); +static FAKE_BASE_URL: LazyLock = + LazyLock::new(|| reqwest::Url::parse("ftp://secret-lychee-local-base-url.invalid/").unwrap()); +fn apply_base(input: &str, base: Option<&Url>) -> std::result::Result { let fake_base = base.clone().map(|base| match base { - base if base.scheme() == "file" => &secret_local_base, + base if base.scheme() == "file" => &*FAKE_BASE_URL, base => base, }); let url = reqwest::Url::options().base_url(fake_base).parse(input)?; - match secret_local_base.make_relative(&url) { + match FAKE_BASE_URL.make_relative(&url) { Some(subpath) => base.unwrap().join(&subpath), None => Ok(url), } From af11fc693c01927e57630590fa12a4ece475a7e7 Mon Sep 17 00:00:00 2001 From: rina Date: Mon, 1 Sep 2025 00:27:44 +1000 Subject: [PATCH 06/59] absolute --- lychee-lib/src/utils/request.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lychee-lib/src/utils/request.rs 
b/lychee-lib/src/utils/request.rs index 121cacc68d..92763ede1d 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -103,7 +103,7 @@ fn try_parse_into_uri( // println!("{:?}", remote_base.join("/rooted file ")); let source_base = match source { InputSource::RemoteUrl(url) => Some(Cow::Borrowed(url.deref())), - InputSource::FsPath(path) => match path.canonicalize() { + InputSource::FsPath(path) => match std::path::absolute(path) { Ok(path) => path .strip_prefix(&*root_dir) .ok() From 4fd79999d3afd12fb072389d4b12c8f81f2ef8f0 Mon Sep 17 00:00:00 2001 From: rina Date: Mon, 1 Sep 2025 12:37:39 +1000 Subject: [PATCH 07/59] blah --- lychee-lib/src/utils/request.rs | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 92763ede1d..83d6abfba1 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -99,10 +99,10 @@ fn try_parse_into_uri( // let root_dir = root_dir_url.as_ref().map(Url::as_str); match (base, root_dir) { - (Some(remote_base), Some(root_dir)) => { + (Some(ref remote_base), Some(root_dir)) => { // println!("{:?}", remote_base.join("/rooted file ")); - let source_base = match source { - InputSource::RemoteUrl(url) => Some(Cow::Borrowed(url.deref())), + let thingy = match source { + InputSource::RemoteUrl(url) => Some((Cow::Borrowed(url.deref()), Cow::Borrowed(""), true)), InputSource::FsPath(path) => match std::path::absolute(path) { Ok(path) => path .strip_prefix(&*root_dir) @@ -110,20 +110,17 @@ fn try_parse_into_uri( .map(|subpath| { // let subpath = subpath.strip_prefix(subpath.join("/")).unwrap_or(subpath); println!("subpath = {:?}", subpath); - remote_base - .join(&subpath.to_string_lossy()) - .expect("joining onto base url failed?!") + (Cow::Borrowed(remote_base), subpath.to_string_lossy(), true) }) - .map(Cow::Owned) .or_else(|| { - Some(Cow::Owned( - Url::from_file_path(path).expect("path to 
url failed?"), - )) + Some((Cow::Owned( + Url::from_file_path(&path).expect("path to url failed?")), Cow::Borrowed(""), false)) }), Err(_) => None, }, _ => None, }; + let source_base = thingy.map(|x| x.0); let base2 = source_base; // println!("base = {:?}, uri = {:?}", base2.as_deref(), &raw_uri.text); From c288197013f40bc4c93e2a2d4a3a9fe6183eff4a Mon Sep 17 00:00:00 2001 From: rina Date: Mon, 1 Sep 2025 18:18:09 +1000 Subject: [PATCH 08/59] blahblahblah. transpose + back-substitution --- lychee-lib/src/collector.rs | 1 + lychee-lib/src/types/base.rs | 7 +- lychee-lib/src/utils/request.rs | 158 ++++++++++++++++++++------------ 3 files changed, 105 insertions(+), 61 deletions(-) diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 2b3c35ae0b..1b79937e11 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -289,6 +289,7 @@ impl Collector { async move { let content = content?; let uris: Vec = extractor.extract(&content); + println!("{:?}", &uris); let requests = request::create( uris, &content.source, diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs index 41d31aed7a..b266f25805 100644 --- a/lychee-lib/src/types/base.rs +++ b/lychee-lib/src/types/base.rs @@ -30,10 +30,11 @@ impl Base { } } - pub(crate) fn to_url(&self) -> Option { + pub(crate) fn to_url(&self) -> Result { match self { - Self::Remote(url) => Some(url.clone()), - Self::Local(path) => Url::from_file_path(path).ok(), + Self::Remote(url) => Ok(url.clone()), + Self::Local(path) => Url::from_file_path(path) + .map_err(|()| ErrorKind::InvalidUrlFromPath(path.to_owned())), } } diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 83d6abfba1..60016b87b2 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -46,16 +46,32 @@ fn create_request( static FAKE_BASE_URL: LazyLock = LazyLock::new(|| reqwest::Url::parse("ftp://secret-lychee-local-base-url.invalid/").unwrap()); -fn 
apply_base(input: &str, base: Option<&Url>) -> std::result::Result { - let fake_base = base.clone().map(|base| match base { - base if base.scheme() == "file" => &*FAKE_BASE_URL, - base => base, - }); +fn apply_base(base: &Url, subpath: &str, link: &str) -> std::result::Result { + // tests: + // - .. out of local base should be blocked. + // - scheme-relative urls should work and not spuriously trigger base url + // - fully-qualified urls should work + // - slash should work to go to local base, if specified + // - slash should be forbidden for inferred base urls. + // - percent encoding ;-; + // - trailing slashes in base-url and/or root-dir + let fake_base = match base.scheme() { + "file" => { + let mut fake_base = base.join("/")?; + fake_base.set_host(Some("secret-lychee-local-base-url.invalid"))?; + Some(fake_base) + } + _ => None, + }; - let url = reqwest::Url::options().base_url(fake_base).parse(input)?; + let url = fake_base + .as_ref() + .unwrap_or(base) + .join(subpath)? + .join(link)?; - match FAKE_BASE_URL.make_relative(&url) { - Some(subpath) => base.unwrap().join(&subpath), + match fake_base.and_then(|b| b.make_relative(&url)) { + Some(relative_to_base) => base.join(&relative_to_base), None => Ok(url), } } @@ -79,11 +95,21 @@ fn try_parse_into_uri( ) -> Result { let text = prepend_root_dir_if_absolute_local_link(&raw_uri.text, root_dir); + let root_dir_url = match root_dir { + Some(path) => Url::from_directory_path(path) + .map_err(|()| ErrorKind::InvalidUrlFromPath(path.to_owned()))? 
+ .into(), + None => None, + }; + // println!("{:?}", base.clone().unwrap().join("/rooted")); - let base = base - .and_then(Base::to_url) - .or_else(|| root_dir.and_then(|root| Url::from_file_path(root).ok())); - println!("{:?}", apply_base(&raw_uri.text, base.as_ref())); + let base: Option = base + .map(Cow::Borrowed) + .or_else(|| root_dir.map(|d| Base::Local(d.to_owned())).map(Cow::Owned)) + .as_ref() + .map(|b| b.to_url()) + .transpose()?; + // println!("{:?}", apply_base(&raw_uri.text, base.as_ref())); // println!("{:?}", base.clone().unwrap().join("not rooted")); // println!("{:?}", base.clone().unwrap().join("/rooted")); @@ -97,56 +123,72 @@ fn try_parse_into_uri( // .map(Path::to_string_lossy) // .map(|x| Url::from_file_path(&*x).expect("file path to url failed?!")); // let root_dir = root_dir_url.as_ref().map(Url::as_str); + // - match (base, root_dir) { - (Some(ref remote_base), Some(root_dir)) => { - // println!("{:?}", remote_base.join("/rooted file ")); - let thingy = match source { - InputSource::RemoteUrl(url) => Some((Cow::Borrowed(url.deref()), Cow::Borrowed(""), true)), - InputSource::FsPath(path) => match std::path::absolute(path) { - Ok(path) => path - .strip_prefix(&*root_dir) - .ok() - .map(|subpath| { - // let subpath = subpath.strip_prefix(subpath.join("/")).unwrap_or(subpath); - println!("subpath = {:?}", subpath); - (Cow::Borrowed(remote_base), subpath.to_string_lossy(), true) - }) - .or_else(|| { - Some((Cow::Owned( - Url::from_file_path(&path).expect("path to url failed?")), Cow::Borrowed(""), false)) - }), - Err(_) => None, - }, + let fallback_local_base = |path: &Path| match Url::from_file_path(path) { + Ok(path_url) => { + let top = path_url.join("/").unwrap(); + let subpath = top.make_relative(&path_url).unwrap(); + Ok(move || (Cow::Owned(top), Cow::Owned(subpath), false)) + } + Err(()) => Err(ErrorKind::InvalidUrlFromPath(path.to_owned())), + }; + + // println!("{:?}", remote_base.join("/rooted file ")); + let base_info: 
Option<(Cow, Cow, bool)> = match source { + InputSource::RemoteUrl(url) => Some((Cow::Borrowed(url.deref()), Cow::Borrowed(""), true)), + InputSource::FsPath(path) => match std::path::absolute(path) { + Ok(path) => match (&base, &root_dir) { + (Some(base), Some(root_dir)) => path.strip_prefix(root_dir).ok().map(|subpath| { + ( + Cow::Borrowed(base), + Cow::Owned(subpath.to_string_lossy().into()), + true, + ) + }), _ => None, - }; - let source_base = thingy.map(|x| x.0); - - let base2 = source_base; - // println!("base = {:?}, uri = {:?}", base2.as_deref(), &raw_uri.text); - - let ads = apply_base(&raw_uri.text, base2.as_deref()) - .map_err(|e| ErrorKind::ParseUrl(e, raw_uri.text.clone())); - let x = ads?; - // println!("{:?}", x.as_str()); - - // TODO: MAP BACK TO local root dir by checking if ads starts with base. - - // let uri = match Uri::try_from(raw_uri.clone()) { - // Ok(uri) => uri, - // Err(_) => match base { - // Some(base_url) => match base_url.join(&text) { - // Some(url) => Uri { url }, - // None => return Err(ErrorKind::InvalidBaseJoin(text.clone())), - // }, - // None => panic!("no base :((((("), - // }, - // }; - // println!(" = {uri:?}"); - Ok(Uri { url: x }) + } + .unwrap_or_else(fallback_local_base(&path)?) + .into(), + Err(_) => None, + }, + _ => None, + }; + println!("{} {:?}", &raw_uri.text, &base_info); + + match base_info { + Some((_, _, false)) if raw_uri.text.trim_ascii_start().starts_with("/") => { + Url::parse(&raw_uri.text) // this is expected to fail. 
+ } + Some((base, subpath, _)) => { + apply_base(&base, &subpath, &raw_uri.text).and_then(|url| { + match (base.make_relative(&url), &root_dir_url) { + (Some(base_url_subpath), Some(root_dir_url)) => { + root_dir_url.join(&base_url_subpath) + } + _ => Ok(url), + } + }) } - _ => panic!("fdjsiao"), + None => Url::parse(&raw_uri.text), } + .inspect(|x| println!("-----> {}", x)) + .map(|url| Uri { url }) + .map_err(|e| ErrorKind::ParseUrl(e, raw_uri.text.clone())) + + // TODO: MAP BACK TO local root dir by checking if ads starts with base. + + // let uri = match Uri::try_from(raw_uri.clone()) { + // Ok(uri) => uri, + // Err(_) => match base { + // Some(base_url) => match base_url.join(&text) { + // Some(url) => Uri { url }, + // None => return Err(ErrorKind::InvalidBaseJoin(text.clone())), + // }, + // None => panic!("no base :((((("), + // }, + // }; + // println!(" = {uri:?}"); // let base = base.and_then(Base::to_url); } From 5cd3750f591005ac491c7ecfb121ab156423aaa6 Mon Sep 17 00:00:00 2001 From: rina Date: Mon, 1 Sep 2025 18:43:35 +1000 Subject: [PATCH 09/59] fix '..' being remapped too eagerly --- lychee-lib/src/utils/request.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 60016b87b2..4b6744726f 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -47,6 +47,7 @@ static FAKE_BASE_URL: LazyLock = LazyLock::new(|| reqwest::Url::parse("ftp://secret-lychee-local-base-url.invalid/").unwrap()); fn apply_base(base: &Url, subpath: &str, link: &str) -> std::result::Result { + println!("applying {}, {}, {}", base, subpath, link); // tests: // - .. out of local base should be blocked. 
// - scheme-relative urls should work and not spuriously trigger base url @@ -74,6 +75,7 @@ fn apply_base(base: &Url, subpath: &str, link: &str) -> std::result::Result base.join(&relative_to_base), None => Ok(url), } + .inspect(|x| println!("= {}", x)) } /// Try to parse the raw URI into a `Uri`. @@ -102,7 +104,7 @@ fn try_parse_into_uri( None => None, }; - // println!("{:?}", base.clone().unwrap().join("/rooted")); + // println!("{:?}", base.clone()); let base: Option = base .map(Cow::Borrowed) .or_else(|| root_dir.map(|d| Base::Local(d.to_owned())).map(Cow::Owned)) @@ -154,7 +156,7 @@ fn try_parse_into_uri( }, _ => None, }; - println!("{} {:?}", &raw_uri.text, &base_info); + // println!("{} {:?}", &raw_uri.text, &base_info); match base_info { Some((_, _, false)) if raw_uri.text.trim_ascii_start().starts_with("/") => { @@ -163,7 +165,7 @@ fn try_parse_into_uri( Some((base, subpath, _)) => { apply_base(&base, &subpath, &raw_uri.text).and_then(|url| { match (base.make_relative(&url), &root_dir_url) { - (Some(base_url_subpath), Some(root_dir_url)) => { + (Some(base_url_subpath), Some(root_dir_url)) if !base_url_subpath.starts_with("..") => { root_dir_url.join(&base_url_subpath) } _ => Ok(url), From 9f1c1e45ba88cdc63f010aba60e7627a0b7a0c35 Mon Sep 17 00:00:00 2001 From: rina Date: Mon, 1 Sep 2025 20:30:03 +1000 Subject: [PATCH 10/59] stash --- lychee-lib/src/types/base.rs | 6 ++++-- lychee-lib/src/types/input/source.rs | 12 +++++------- lychee-lib/src/utils/request.rs | 24 ++++++++++++++---------- lychee-lib/src/utils/reqwest.rs | 8 ++++++++ 4 files changed, 31 insertions(+), 19 deletions(-) diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs index b266f25805..1111f7b9cc 100644 --- a/lychee-lib/src/types/base.rs +++ b/lychee-lib/src/types/base.rs @@ -33,8 +33,10 @@ impl Base { pub(crate) fn to_url(&self) -> Result { match self { Self::Remote(url) => Ok(url.clone()), - Self::Local(path) => Url::from_file_path(path) - .map_err(|()| 
ErrorKind::InvalidUrlFromPath(path.to_owned())), + Self::Local(path) => std::path::absolute(path) + .ok() + .and_then(|x| Url::from_directory_path(x).ok()) + .ok_or_else(|| ErrorKind::InvalidUrlFromPath(path.to_owned())), } } diff --git a/lychee-lib/src/types/input/source.rs b/lychee-lib/src/types/input/source.rs index 164feb17c9..04d6835c35 100644 --- a/lychee-lib/src/types/input/source.rs +++ b/lychee-lib/src/types/input/source.rs @@ -14,6 +14,7 @@ //! and filtered by extension //! - URLs, raw strings, and standard input (`stdin`) are read directly +use crate::Base; use reqwest::Url; use serde::{Deserialize, Serialize}; use std::fmt::Display; @@ -42,14 +43,11 @@ pub enum InputSource { impl InputSource { /// Converts a [`Self::RemoteUrl`] or [`Self::FsPath`] to a - /// [`Url`], if possible. - /// - /// Returns `None` if the [`InputSource`] is not these cases, - /// of if the `FsPath` is not a valid URL. - pub fn to_url(&self) -> Option { + /// [`Base`]. Returns `None` for other `InputSource` variants. + pub fn to_base(&self) -> Option { match self { - Self::RemoteUrl(url) => Some(*url.clone()), - Self::FsPath(path) => Url::from_file_path(path.canonicalize().ok()?).ok(), + Self::RemoteUrl(url) => Some(Base::Remote(*url.clone())), + Self::FsPath(path) => Some(Base::Local(path.clone())), _ => None, } } diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 4b6744726f..fe01eed28a 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -75,7 +75,6 @@ fn apply_base(base: &Url, subpath: &str, link: &str) -> std::result::Result base.join(&relative_to_base), None => Ok(url), } - .inspect(|x| println!("= {}", x)) } /// Try to parse the raw URI into a `Uri`. 
@@ -127,17 +126,20 @@ fn try_parse_into_uri( // let root_dir = root_dir_url.as_ref().map(Url::as_str); // - let fallback_local_base = |path: &Path| match Url::from_file_path(path) { - Ok(path_url) => { - let top = path_url.join("/").unwrap(); - let subpath = top.make_relative(&path_url).unwrap(); - Ok(move || (Cow::Owned(top), Cow::Owned(subpath), false)) - } - Err(()) => Err(ErrorKind::InvalidUrlFromPath(path.to_owned())), + let fallback_local_base = |url: &Url, allow_absolute: bool| { + let top = path_url.join("/").unwrap(); + let subpath = top.make_relative(&path_url).unwrap(); + move || (Cow::Owned(top), Cow::Owned(subpath), allow_absolute) }; + let source_base = source.to_base(); + let source_url = source_base.as_ref().map(Base::to_url).transpose()?; + // println!("{:?}", remote_base.join("/rooted file ")); - let base_info: Option<(Cow, Cow, bool)> = match source { + let base_info = source_url.map(|url| match url.strip_prefix(root_dir) { + Some(subpath) => (Cow::Borrowed(base), Cow::Owned(subpath), true) + None => fallback_local_base(url, true) + }); InputSource::RemoteUrl(url) => Some((Cow::Borrowed(url.deref()), Cow::Borrowed(""), true)), InputSource::FsPath(path) => match std::path::absolute(path) { Ok(path) => match (&base, &root_dir) { @@ -165,7 +167,9 @@ fn try_parse_into_uri( Some((base, subpath, _)) => { apply_base(&base, &subpath, &raw_uri.text).and_then(|url| { match (base.make_relative(&url), &root_dir_url) { - (Some(base_url_subpath), Some(root_dir_url)) if !base_url_subpath.starts_with("..") => { + (Some(base_url_subpath), Some(root_dir_url)) + if !base_url_subpath.starts_with("..") => + { root_dir_url.join(&base_url_subpath) } _ => Ok(url), diff --git a/lychee-lib/src/utils/reqwest.rs b/lychee-lib/src/utils/reqwest.rs index 8c54203e52..5e538aa68b 100644 --- a/lychee-lib/src/utils/reqwest.rs +++ b/lychee-lib/src/utils/reqwest.rs @@ -1,5 +1,13 @@ use std::error::Error; +impl ReqwestUrlExt for reqwest::Url { + + fn strip_prefix(&self, prefix: 
&reqwest::Url) -> Option { + prefix.make_relative(self).filter(|subpath| !subpath.starts_with("../")) + } + +} + /// A rule for matching error message patterns to human-readable messages struct ErrorRule { patterns: &'static [&'static str], From aee0ed1a001ebf5dfe8dd8fbe07c829b495a6b81 Mon Sep 17 00:00:00 2001 From: rina Date: Mon, 1 Sep 2025 22:56:57 +1000 Subject: [PATCH 11/59] refactor the thingy into separate mods --- lychee-lib/src/types/input/source.rs | 23 ++++-- lychee-lib/src/utils/request.rs | 104 ++++++++++++--------------- lychee-lib/src/utils/reqwest.rs | 10 ++- 3 files changed, 70 insertions(+), 67 deletions(-) diff --git a/lychee-lib/src/types/input/source.rs b/lychee-lib/src/types/input/source.rs index 04d6835c35..0fd050f17a 100644 --- a/lychee-lib/src/types/input/source.rs +++ b/lychee-lib/src/types/input/source.rs @@ -14,10 +14,11 @@ //! and filtered by extension //! - URLs, raw strings, and standard input (`stdin`) are read directly -use crate::Base; +use crate::{Base, ErrorKind}; use reqwest::Url; use serde::{Deserialize, Serialize}; use std::fmt::Display; +use std::ops::Deref; use std::path::PathBuf; /// Input types which lychee supports @@ -42,13 +43,21 @@ pub enum InputSource { } impl InputSource { - /// Converts a [`Self::RemoteUrl`] or [`Self::FsPath`] to a - /// [`Base`]. Returns `None` for other `InputSource` variants. - pub fn to_base(&self) -> Option { + /// Converts an [`InputSource::RemoteUrl`] or [`InputSource::FsPath`] + /// to a [`Url`] pointing to the source. + /// + /// The outer result indicates whether the operation succeeded. + /// For `InputSource` variants which are not `RemoteUrl` or `FsPath`, + /// the operation will "succeed" with `None`. 
+ pub fn to_url(&self) -> Result, ErrorKind> { match self { - Self::RemoteUrl(url) => Some(Base::Remote(*url.clone())), - Self::FsPath(path) => Some(Base::Local(path.clone())), - _ => None, + Self::RemoteUrl(url) => Ok(Some(url.deref().clone())), + Self::FsPath(path) => std::path::absolute(path) + .ok() + .and_then(|x| Url::from_file_path(x).ok()) + .ok_or_else(|| ErrorKind::InvalidUrlFromPath(path.to_owned())) + .map(Some), + _ => Ok(None), } } } diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index fe01eed28a..01c6e4d3b9 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -13,9 +13,8 @@ use crate::{ Base, BasicAuthCredentials, ErrorKind, Request, Result, Uri, basic_auth::BasicAuthExtractor, types::{InputSource, uri::raw::RawUri}, - utils::{path, url}, + utils::{path, reqwest::ReqwestUrlExt, url}, }; - use ::url::ParseError; /// Extract basic auth credentials for a given URL. @@ -43,11 +42,8 @@ fn create_request( Ok(Request::new(uri, source, element, attribute, credentials)) } -static FAKE_BASE_URL: LazyLock = - LazyLock::new(|| reqwest::Url::parse("ftp://secret-lychee-local-base-url.invalid/").unwrap()); - fn apply_base(base: &Url, subpath: &str, link: &str) -> std::result::Result { - println!("applying {}, {}, {}", base, subpath, link); + // println!("applying {}, {}, {}", base, subpath, link); // tests: // - .. out of local base should be blocked. 
// - scheme-relative urls should work and not spuriously trigger base url @@ -56,10 +52,11 @@ fn apply_base(base: &Url, subpath: &str, link: &str) -> std::result::Result { let mut fake_base = base.join("/")?; - fake_base.set_host(Some("secret-lychee-local-base-url.invalid"))?; + fake_base.set_host(Some("secret-lychee-base-url.invalid"))?; Some(fake_base) } _ => None, @@ -75,6 +72,7 @@ fn apply_base(base: &Url, subpath: &str, link: &str) -> std::result::Result base.join(&relative_to_base), None => Ok(url), } + .inspect(|x| println!("---> {}", x)) } /// Try to parse the raw URI into a `Uri`. @@ -96,20 +94,19 @@ fn try_parse_into_uri( ) -> Result { let text = prepend_root_dir_if_absolute_local_link(&raw_uri.text, root_dir); - let root_dir_url = match root_dir { - Some(path) => Url::from_directory_path(path) - .map_err(|()| ErrorKind::InvalidUrlFromPath(path.to_owned()))? - .into(), - None => None, - }; + let root_dir_url = root_dir + .map(|path| { + Url::from_directory_path(path) + .map_err(|()| ErrorKind::InvalidUrlFromPath(path.to_owned())) + }) + .transpose()?; // println!("{:?}", base.clone()); - let base: Option = base - .map(Cow::Borrowed) - .or_else(|| root_dir.map(|d| Base::Local(d.to_owned())).map(Cow::Owned)) - .as_ref() - .map(|b| b.to_url()) - .transpose()?; + let base_url: Option> = base + .map(Base::to_url) + .transpose()? 
+ .map(Cow::Owned) + .or(root_dir_url.as_ref().map(Cow::Borrowed)); // println!("{:?}", apply_base(&raw_uri.text, base.as_ref())); // println!("{:?}", base.clone().unwrap().join("not rooted")); @@ -126,61 +123,54 @@ fn try_parse_into_uri( // let root_dir = root_dir_url.as_ref().map(Url::as_str); // - let fallback_local_base = |url: &Url, allow_absolute: bool| { - let top = path_url.join("/").unwrap(); - let subpath = top.make_relative(&path_url).unwrap(); - move || (Cow::Owned(top), Cow::Owned(subpath), allow_absolute) + let fallback_local_base = |url: &Url| -> Option<_> { + let top = url.join("/").ok()?; + let subpath = top.make_relative(&url)?; + Some((Cow::Owned(top), Cow::Owned(subpath), url.scheme() != "file")) }; - let source_base = source.to_base(); - let source_url = source_base.as_ref().map(Base::to_url).transpose()?; - - // println!("{:?}", remote_base.join("/rooted file ")); - let base_info = source_url.map(|url| match url.strip_prefix(root_dir) { - Some(subpath) => (Cow::Borrowed(base), Cow::Owned(subpath), true) - None => fallback_local_base(url, true) - }); - InputSource::RemoteUrl(url) => Some((Cow::Borrowed(url.deref()), Cow::Borrowed(""), true)), - InputSource::FsPath(path) => match std::path::absolute(path) { - Ok(path) => match (&base, &root_dir) { - (Some(base), Some(root_dir)) => path.strip_prefix(root_dir).ok().map(|subpath| { - ( - Cow::Borrowed(base), - Cow::Owned(subpath.to_string_lossy().into()), - true, - ) - }), - _ => None, - } - .unwrap_or_else(fallback_local_base(&path)?) - .into(), - Err(_) => None, - }, - _ => None, + let source_url = source.to_url()?; + + let base_info = match &source_url { + Some(url) => match (base_url.as_deref(), &root_dir_url) { + (Some(base_url), Some(root_dir_url)) => url + .strip_prefix(root_dir_url) + .map(|subpath| (Cow::Borrowed(base_url), Cow::Owned(subpath), true)), + _ => None, + } + .map_or_else( + || fallback_local_base(url).ok_or(ErrorKind::InvalidUrlHost), + Ok, + )? 
+ .into(), + None => None, }; - // println!("{} {:?}", &raw_uri.text, &base_info); + println!("{} {:?}", &raw_uri.text, &base_info); + + // match Uri::try_from(raw_uri.clone()) { + // Ok(uri) => return Ok(uri), + // _ => (), + // }; match base_info { Some((_, _, false)) if raw_uri.text.trim_ascii_start().starts_with("/") => { - Url::parse(&raw_uri.text) // this is expected to fail. + Err(ParseError::RelativeUrlWithoutBase) } Some((base, subpath, _)) => { apply_base(&base, &subpath, &raw_uri.text).and_then(|url| { - match (base.make_relative(&url), &root_dir_url) { - (Some(base_url_subpath), Some(root_dir_url)) - if !base_url_subpath.starts_with("..") => - { + match (url.strip_prefix(&base), &root_dir_url) { + (Some(base_url_subpath), Some(root_dir_url)) => { root_dir_url.join(&base_url_subpath) } _ => Ok(url), } }) } - None => Url::parse(&raw_uri.text), + None => Url::parse(&raw_uri.text) } - .inspect(|x| println!("-----> {}", x)) - .map(|url| Uri { url }) + // .inspect(|x| println!("-----> {}", x)) .map_err(|e| ErrorKind::ParseUrl(e, raw_uri.text.clone())) + .map(|url| Uri { url }) // TODO: MAP BACK TO local root dir by checking if ads starts with base. 
diff --git a/lychee-lib/src/utils/reqwest.rs b/lychee-lib/src/utils/reqwest.rs index 5e538aa68b..3daad67fbf 100644 --- a/lychee-lib/src/utils/reqwest.rs +++ b/lychee-lib/src/utils/reqwest.rs @@ -1,11 +1,15 @@ use std::error::Error; -impl ReqwestUrlExt for reqwest::Url { +pub(crate) trait ReqwestUrlExt { + fn strip_prefix(&self, prefix: &reqwest::Url) -> Option; +} +impl ReqwestUrlExt for reqwest::Url { fn strip_prefix(&self, prefix: &reqwest::Url) -> Option { - prefix.make_relative(self).filter(|subpath| !subpath.starts_with("../")) + prefix + .make_relative(self) + .filter(|subpath| !subpath.starts_with("../")) } - } /// A rule for matching error message patterns to human-readable messages From eebc3d45030db2634542371ec47b26fee9dc6788 Mon Sep 17 00:00:00 2001 From: rina Date: Mon, 1 Sep 2025 23:16:04 +1000 Subject: [PATCH 12/59] use base::local for from_directory_path --- lychee-lib/src/utils/request.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 01c6e4d3b9..2c0e268834 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -95,10 +95,7 @@ fn try_parse_into_uri( let text = prepend_root_dir_if_absolute_local_link(&raw_uri.text, root_dir); let root_dir_url = root_dir - .map(|path| { - Url::from_directory_path(path) - .map_err(|()| ErrorKind::InvalidUrlFromPath(path.to_owned())) - }) + .map(|path| Base::Local(path.to_owned()).to_url()) .transpose()?; // println!("{:?}", base.clone()); From e55766f065ede6b16f23a23d85bd26fd29199b93 Mon Sep 17 00:00:00 2001 From: rina Date: Mon, 1 Sep 2025 23:33:17 +1000 Subject: [PATCH 13/59] touching --- lychee-lib/src/utils/request.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 2c0e268834..a9cb2dbeb1 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -120,7 +120,7 @@ fn 
try_parse_into_uri( // let root_dir = root_dir_url.as_ref().map(Url::as_str); // - let fallback_local_base = |url: &Url| -> Option<_> { + let infer_source_base = |url: &Url| -> Option<_> { let top = url.join("/").ok()?; let subpath = top.make_relative(&url)?; Some((Cow::Owned(top), Cow::Owned(subpath), url.scheme() != "file")) @@ -129,14 +129,14 @@ fn try_parse_into_uri( let source_url = source.to_url()?; let base_info = match &source_url { - Some(url) => match (base_url.as_deref(), &root_dir_url) { - (Some(base_url), Some(root_dir_url)) => url + Some(source_url) => match (base_url.as_deref(), &root_dir_url) { + (Some(base_url), Some(root_dir_url)) => source_url .strip_prefix(root_dir_url) .map(|subpath| (Cow::Borrowed(base_url), Cow::Owned(subpath), true)), _ => None, } .map_or_else( - || fallback_local_base(url).ok_or(ErrorKind::InvalidUrlHost), + || infer_source_base(source_url).ok_or(ErrorKind::InvalidUrlHost), Ok, )? .into(), @@ -163,7 +163,7 @@ fn try_parse_into_uri( } }) } - None => Url::parse(&raw_uri.text) + None => Url::parse(&raw_uri.text), } // .inspect(|x| println!("-----> {}", x)) .map_err(|e| ErrorKind::ParseUrl(e, raw_uri.text.clone())) From 1c7af899ee7c18a6eb6691b0f3d9d2606921bc0b Mon Sep 17 00:00:00 2001 From: rina Date: Tue, 2 Sep 2025 00:17:07 +1000 Subject: [PATCH 14/59] TODO: hoist all this source-dependent computation maybe we can make it work if we get rid of Cow. 
if it's only run once per input, then that should be fast enough --- lychee-lib/src/checker/file.rs | 32 ++++++++++++++++---------------- lychee-lib/src/utils/request.rs | 20 +++++++++----------- lychee-lib/src/utils/reqwest.rs | 2 ++ 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/lychee-lib/src/checker/file.rs b/lychee-lib/src/checker/file.rs index b313c98340..a22d9ff02e 100644 --- a/lychee-lib/src/checker/file.rs +++ b/lychee-lib/src/checker/file.rs @@ -96,22 +96,22 @@ impl FileChecker { /// Returns the resolved path as a `PathBuf`, or the original path /// if no base path is defined. fn resolve_base(&self, path: &Path) -> PathBuf { - if let Some(Base::Local(base_path)) = &self.base { - if path.is_absolute() { - let absolute_base_path = if base_path.is_relative() { - std::env::current_dir().unwrap_or_default().join(base_path) - } else { - base_path.clone() - }; - - let stripped = path.strip_prefix("/").unwrap_or(path); - absolute_base_path.join(stripped) - } else { - base_path.join(path) - } - } else { - path.to_path_buf() - } + // if let Some(Base::Local(base_path)) = &self.base { + // if path.is_absolute() { + // let absolute_base_path = if base_path.is_relative() { + // std::env::current_dir().unwrap_or_default().join(base_path) + // } else { + // base_path.clone() + // }; + // + // let stripped = path.strip_prefix("/").unwrap_or(path); + // absolute_base_path.join(stripped) + // } else { + // base_path.join(path) + // } + // } else { + path.to_path_buf() + // } } /// Resolves the given local path by applying logic which is specific to local file diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index a9cb2dbeb1..d95fe2aef5 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -153,19 +153,17 @@ fn try_parse_into_uri( Some((_, _, false)) if raw_uri.text.trim_ascii_start().starts_with("/") => { Err(ParseError::RelativeUrlWithoutBase) } - Some((base, subpath, _)) => { - 
apply_base(&base, &subpath, &raw_uri.text).and_then(|url| { - match (url.strip_prefix(&base), &root_dir_url) { - (Some(base_url_subpath), Some(root_dir_url)) => { - root_dir_url.join(&base_url_subpath) - } - _ => Ok(url), - } - }) - } + Some((base, subpath, _allow_absolute)) => apply_base(&base, &subpath, &raw_uri.text) + .and_then(|url| match (base_url.as_deref(), &root_dir_url) { + (Some(base_url), Some(root_dir_url)) => url + .strip_prefix(&base_url) + .and_then(|subpath| root_dir_url.join(&subpath).ok()) + .map_or(Ok(url), Ok), + _ => Ok(url), + }), None => Url::parse(&raw_uri.text), } - // .inspect(|x| println!("-----> {}", x)) + .inspect(|x| println!("OUT -----> {}", x)) .map_err(|e| ErrorKind::ParseUrl(e, raw_uri.text.clone())) .map(|url| Uri { url }) diff --git a/lychee-lib/src/utils/reqwest.rs b/lychee-lib/src/utils/reqwest.rs index 3daad67fbf..23d9bdfeed 100644 --- a/lychee-lib/src/utils/reqwest.rs +++ b/lychee-lib/src/utils/reqwest.rs @@ -9,6 +9,8 @@ impl ReqwestUrlExt for reqwest::Url { prefix .make_relative(self) .filter(|subpath| !subpath.starts_with("../")) + // .inspect(|x| println!("subpathing {}", x)) + // .filter(|_| prefix.as_str().starts_with(self.as_str())) } } From 1bbc28d990ffbd3b7c49fc610ea929e7f665bf01 Mon Sep 17 00:00:00 2001 From: rina Date: Sat, 6 Sep 2025 23:12:16 +1000 Subject: [PATCH 15/59] sourcebaseinfo --- lychee-lib/src/types/mod.rs | 2 + lychee-lib/src/types/source_base_info.rs | 100 +++++++++++++++++++++++ lychee-lib/src/utils/request.rs | 78 +++++++----------- lychee-lib/src/utils/url.rs | 37 +++++++++ 4 files changed, 169 insertions(+), 48 deletions(-) create mode 100644 lychee-lib/src/types/source_base_info.rs diff --git a/lychee-lib/src/types/mod.rs b/lychee-lib/src/types/mod.rs index 79bfdd5fa8..dc245dec1b 100644 --- a/lychee-lib/src/types/mod.rs +++ b/lychee-lib/src/types/mod.rs @@ -12,6 +12,7 @@ pub(crate) mod mail; mod request; pub(crate) mod resolver; mod response; +mod source_base_info; mod status; mod status_code; 
pub(crate) mod uri; @@ -26,6 +27,7 @@ pub use file::{FileExtensions, FileType}; pub use input::{Input, InputContent, InputResolver, InputSource}; pub use request::Request; pub use response::{Response, ResponseBody}; +pub use source_base_info::SourceBaseInfo; pub use status::Status; pub use status_code::*; diff --git a/lychee-lib/src/types/source_base_info.rs b/lychee-lib/src/types/source_base_info.rs new file mode 100644 index 0000000000..78b68b5e23 --- /dev/null +++ b/lychee-lib/src/types/source_base_info.rs @@ -0,0 +1,100 @@ +use reqwest::Url; +use std::path::Path; + +use crate::Base; +use crate::ErrorKind; +use crate::InputSource; +use crate::Uri; +use crate::types::uri::raw::RawUri; +use crate::utils::reqwest::ReqwestUrlExt; +use crate::utils::url::apply_rooted_base_url; + +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct SourceBaseInfo { + origin: Url, + subpath: String, + allow_absolute: bool, + remote_local_mappings: Vec<(Url, Url)>, +} + +impl SourceBaseInfo { + fn infer_source_base(url: &Url) -> Option<(Url, String, bool)> { + let origin = url.join("/").ok()?; + let subpath = origin.make_relative(&url)?; + Some((origin, subpath, url.scheme() != "file")) + } + + pub fn from_source( + source: &InputSource, + root_dir: Option<&Path>, + base: Option<&Base>, + ) -> Result, ErrorKind> { + let root_dir_url = root_dir + .map(|path| Base::Local(path.to_owned()).to_url()) + .transpose()?; + + println!("{:?}", base.clone()); + let base_url: Option = base + .map(Base::to_url) + .transpose()? 
+ .or_else(|| root_dir_url.clone()); + + let source_url = source.to_url()?; + + let Some(source_url) = source_url else { + return Ok(None); + }; + + let remote_local_mappings = match (base_url, root_dir_url) { + (Some(base_url), Some(root_dir_url)) => vec![(base_url, root_dir_url)], + _ => vec![], + }; + + let (origin, subpath, allow_absolute) = remote_local_mappings + .iter() + .find_map(|(remote, local)| { + source_url + .strip_prefix(local) + .map(|subpath| (remote.clone(), subpath, true)) + }) + .map_or_else( + || SourceBaseInfo::infer_source_base(&source_url).ok_or(ErrorKind::InvalidUrlHost), + Ok, + )? + .into(); + + Ok(Some(Self { + origin, + subpath, + allow_absolute, + remote_local_mappings, + })) + } + + pub fn parse_uri(&self, raw_uri: &RawUri) -> Result { + let Self { + origin, + subpath, + allow_absolute, + remote_local_mappings, + } = self; + + let is_absolute = raw_uri.text.trim_ascii_start().starts_with("/"); + if !allow_absolute && is_absolute { + return Err(ErrorKind::InvalidBaseJoin(raw_uri.text.clone())); + } + + match apply_rooted_base_url(&origin, &[&subpath, &raw_uri.text]) { + Ok(url) => remote_local_mappings + .iter() + .find_map(|(remote, local)| { + url.strip_prefix(remote) + .and_then(|subpath| local.join(&subpath).ok()) + }) + .map_or(Ok(url), Ok), + Err(e) => Err(e), + } + .map_err(|e| ErrorKind::ParseUrl(e, raw_uri.text.clone())) + .map(|url| Uri { url }) + } +} diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index d95fe2aef5..6febb8d64b 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -9,6 +9,7 @@ use std::{ path::{Path, PathBuf}, }; +use crate::types::SourceBaseInfo; use crate::{ Base, BasicAuthCredentials, ErrorKind, Request, Result, Uri, basic_auth::BasicAuthExtractor, @@ -29,11 +30,13 @@ pub(crate) fn extract_credentials( fn create_request( raw_uri: &RawUri, source: &InputSource, - root_dir: Option<&Path>, - base: Option<&Base>, + base_info: 
Option<&SourceBaseInfo>, extractor: Option<&BasicAuthExtractor>, ) -> Result { - let uri = try_parse_into_uri(raw_uri, source, root_dir, base)?; + let uri = Uri::try_from(raw_uri.clone()).or_else(|e| match base_info { + Some(base_info) => base_info.parse_uri(raw_uri), + None => Err(e), // TODO: more precise error kind? + })?; let source = truncate_source(source); let element = raw_uri.element.clone(); let attribute = raw_uri.attribute.clone(); @@ -42,39 +45,6 @@ fn create_request( Ok(Request::new(uri, source, element, attribute, credentials)) } -fn apply_base(base: &Url, subpath: &str, link: &str) -> std::result::Result { - // println!("applying {}, {}, {}", base, subpath, link); - // tests: - // - .. out of local base should be blocked. - // - scheme-relative urls should work and not spuriously trigger base url - // - fully-qualified urls should work - // - slash should work to go to local base, if specified - // - slash should be forbidden for inferred base urls. - // - percent encoding ;-; - // - trailing slashes in base-url and/or root-dir - // - fragments and query params, on both http and file - let fake_base = match base.scheme() { - "file" => { - let mut fake_base = base.join("/")?; - fake_base.set_host(Some("secret-lychee-base-url.invalid"))?; - Some(fake_base) - } - _ => None, - }; - - let url = fake_base - .as_ref() - .unwrap_or(base) - .join(subpath)? - .join(link)?; - - match fake_base.and_then(|b| b.make_relative(&url)) { - Some(relative_to_base) => base.join(&relative_to_base), - None => Ok(url), - } - .inspect(|x| println!("---> {}", x)) -} - /// Try to parse the raw URI into a `Uri`. /// /// If the raw URI is not a valid URI, create a URI by joining the base URL with the text. 
@@ -153,14 +123,17 @@ fn try_parse_into_uri( Some((_, _, false)) if raw_uri.text.trim_ascii_start().starts_with("/") => { Err(ParseError::RelativeUrlWithoutBase) } - Some((base, subpath, _allow_absolute)) => apply_base(&base, &subpath, &raw_uri.text) - .and_then(|url| match (base_url.as_deref(), &root_dir_url) { - (Some(base_url), Some(root_dir_url)) => url - .strip_prefix(&base_url) - .and_then(|subpath| root_dir_url.join(&subpath).ok()) - .map_or(Ok(url), Ok), - _ => Ok(url), - }), + Some((base, subpath, _allow_absolute)) => { + url::apply_rooted_base_url(&base, &[&subpath, &raw_uri.text]).and_then(|url| { + match (base_url.as_deref(), &root_dir_url) { + (Some(base_url), Some(root_dir_url)) => url + .strip_prefix(&base_url) + .and_then(|subpath| root_dir_url.join(&subpath).ok()) + .map_or(Ok(url), Ok), + _ => Ok(url), + } + }) + } None => Url::parse(&raw_uri.text), } .inspect(|x| println!("OUT -----> {}", x)) @@ -250,16 +223,25 @@ pub(crate) fn create( base: Option<&Base>, extractor: Option<&BasicAuthExtractor>, ) -> HashSet { + let base_info = match SourceBaseInfo::from_source(source, root_dir, base) { + Ok(base_info) => base_info, + Err(e) => { + let source = truncate_source(source); + warn!("Error handling source {}: {:?}", source, e); + return HashSet::new(); + } + }; + uris.into_iter() - .filter_map( - |raw_uri| match create_request(&raw_uri, source, root_dir, base, extractor) { + .filter_map(|raw_uri| { + match create_request(&raw_uri, source, base_info.as_ref(), extractor) { Ok(request) => Some(request), Err(e) => { warn!("Error creating request: {e:?}"); None } - }, - ) + } + }) .collect() } diff --git a/lychee-lib/src/utils/url.rs b/lychee-lib/src/utils/url.rs index dc850c6fb8..8ae6790b5f 100644 --- a/lychee-lib/src/utils/url.rs +++ b/lychee-lib/src/utils/url.rs @@ -1,6 +1,8 @@ +use std::borrow::Cow; use std::sync::LazyLock; use linkify::LinkFinder; +use reqwest::Url; static LINK_FINDER: LazyLock = LazyLock::new(LinkFinder::new); @@ -18,6 +20,41 @@ 
pub(crate) fn remove_get_params_and_separate_fragment(url: &str) -> (&str, Optio (path, frag) } +pub fn apply_rooted_base_url( + base: &Url, + subpaths: &[&str], +) -> std::result::Result { + // println!("applying {}, {}, {}", base, subpath, link); + // tests: + // - .. out of local base should be blocked. + // - scheme-relative urls should work and not spuriously trigger base url + // - fully-qualified urls should work + // - slash should work to go to local base, if specified + // - slash should be forbidden for inferred base urls. + // - percent encoding ;-; + // - trailing slashes in base-url and/or root-dir + // - fragments and query params, on both http and file + let fake_base = match base.scheme() { + "file" => { + let mut fake_base = base.join("/")?; + fake_base.set_host(Some("secret-lychee-base-url.invalid"))?; + Some(fake_base) + } + _ => None, + }; + + let mut url = Cow::Borrowed(fake_base.as_ref().unwrap_or(base)); + for subpath in subpaths { + url = Cow::Owned(url.join(subpath)?); + } + + match fake_base.as_ref().and_then(|b| b.make_relative(&url)) { + Some(relative_to_base) => base.join(&relative_to_base), + None => Ok(url.into_owned()), + } + .inspect(|x| println!("---> {}", x)) +} + // Use `LinkFinder` to offload the raw link searching in plaintext pub(crate) fn find_links(input: &str) -> impl Iterator> { LINK_FINDER.links(input) From 439d7f949b82f3a60094b012bc67c12107e11f0a Mon Sep 17 00:00:00 2001 From: rina Date: Sat, 6 Sep 2025 23:15:36 +1000 Subject: [PATCH 16/59] clippy fix --- lychee-lib/src/types/base.rs | 2 +- lychee-lib/src/types/input/source.rs | 2 +- lychee-lib/src/types/source_base_info.rs | 9 ++++----- lychee-lib/src/utils/request.rs | 12 +++++------- lychee-lib/src/utils/url.rs | 4 ++-- 5 files changed, 13 insertions(+), 16 deletions(-) diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs index 1111f7b9cc..1e9e61bf86 100644 --- a/lychee-lib/src/types/base.rs +++ b/lychee-lib/src/types/base.rs @@ -52,7 +52,7 @@ 
impl Base { // We keep the username and password intact Some(Base::Remote(*base_url)) } - InputSource::FsPath(path) => path.to_path_buf().canonicalize().ok().map(Base::Local), + InputSource::FsPath(path) => path.clone().canonicalize().ok().map(Base::Local), // other inputs do not have a URL to extract a base _ => None, } diff --git a/lychee-lib/src/types/input/source.rs b/lychee-lib/src/types/input/source.rs index 0fd050f17a..67ee3cde19 100644 --- a/lychee-lib/src/types/input/source.rs +++ b/lychee-lib/src/types/input/source.rs @@ -14,7 +14,7 @@ //! and filtered by extension //! - URLs, raw strings, and standard input (`stdin`) are read directly -use crate::{Base, ErrorKind}; +use crate::ErrorKind; use reqwest::Url; use serde::{Deserialize, Serialize}; use std::fmt::Display; diff --git a/lychee-lib/src/types/source_base_info.rs b/lychee-lib/src/types/source_base_info.rs index 78b68b5e23..3eaf40a877 100644 --- a/lychee-lib/src/types/source_base_info.rs +++ b/lychee-lib/src/types/source_base_info.rs @@ -20,7 +20,7 @@ pub struct SourceBaseInfo { impl SourceBaseInfo { fn infer_source_base(url: &Url) -> Option<(Url, String, bool)> { let origin = url.join("/").ok()?; - let subpath = origin.make_relative(&url)?; + let subpath = origin.make_relative(url)?; Some((origin, subpath, url.scheme() != "file")) } @@ -60,8 +60,7 @@ impl SourceBaseInfo { .map_or_else( || SourceBaseInfo::infer_source_base(&source_url).ok_or(ErrorKind::InvalidUrlHost), Ok, - )? 
- .into(); + )?; Ok(Some(Self { origin, @@ -79,12 +78,12 @@ impl SourceBaseInfo { remote_local_mappings, } = self; - let is_absolute = raw_uri.text.trim_ascii_start().starts_with("/"); + let is_absolute = raw_uri.text.trim_ascii_start().starts_with('/'); if !allow_absolute && is_absolute { return Err(ErrorKind::InvalidBaseJoin(raw_uri.text.clone())); } - match apply_rooted_base_url(&origin, &[&subpath, &raw_uri.text]) { + match apply_rooted_base_url(origin, &[subpath, &raw_uri.text]) { Ok(url) => remote_local_mappings .iter() .find_map(|(remote, local)| { diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 6febb8d64b..07456aac69 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -2,8 +2,6 @@ use log::warn; use percent_encoding::percent_decode_str; use reqwest::Url; use std::borrow::Cow; -use std::ops::Deref; -use std::sync::LazyLock; use std::{ collections::HashSet, path::{Path, PathBuf}, @@ -92,7 +90,7 @@ fn try_parse_into_uri( let infer_source_base = |url: &Url| -> Option<_> { let top = url.join("/").ok()?; - let subpath = top.make_relative(&url)?; + let subpath = top.make_relative(url)?; Some((Cow::Owned(top), Cow::Owned(subpath), url.scheme() != "file")) }; @@ -120,14 +118,14 @@ fn try_parse_into_uri( // }; match base_info { - Some((_, _, false)) if raw_uri.text.trim_ascii_start().starts_with("/") => { + Some((_, _, false)) if raw_uri.text.trim_ascii_start().starts_with('/') => { Err(ParseError::RelativeUrlWithoutBase) } Some((base, subpath, _allow_absolute)) => { url::apply_rooted_base_url(&base, &[&subpath, &raw_uri.text]).and_then(|url| { match (base_url.as_deref(), &root_dir_url) { (Some(base_url), Some(root_dir_url)) => url - .strip_prefix(&base_url) + .strip_prefix(base_url) .and_then(|subpath| root_dir_url.join(&subpath).ok()) .map_or(Ok(url), Ok), _ => Ok(url), @@ -136,7 +134,7 @@ fn try_parse_into_uri( } None => Url::parse(&raw_uri.text), } - .inspect(|x| println!("OUT -----> {}", x)) 
+ .inspect(|x| println!("OUT -----> {x}")) .map_err(|e| ErrorKind::ParseUrl(e, raw_uri.text.clone())) .map(|url| Uri { url }) @@ -227,7 +225,7 @@ pub(crate) fn create( Ok(base_info) => base_info, Err(e) => { let source = truncate_source(source); - warn!("Error handling source {}: {:?}", source, e); + warn!("Error handling source {source}: {e:?}"); return HashSet::new(); } }; diff --git a/lychee-lib/src/utils/url.rs b/lychee-lib/src/utils/url.rs index 8ae6790b5f..26f42f99a9 100644 --- a/lychee-lib/src/utils/url.rs +++ b/lychee-lib/src/utils/url.rs @@ -20,7 +20,7 @@ pub(crate) fn remove_get_params_and_separate_fragment(url: &str) -> (&str, Optio (path, frag) } -pub fn apply_rooted_base_url( +pub(crate) fn apply_rooted_base_url( base: &Url, subpaths: &[&str], ) -> std::result::Result { @@ -52,7 +52,7 @@ pub fn apply_rooted_base_url( Some(relative_to_base) => base.join(&relative_to_base), None => Ok(url.into_owned()), } - .inspect(|x| println!("---> {}", x)) + .inspect(|x| println!("---> {x}")) } // Use `LinkFinder` to offload the raw link searching in plaintext From 49f84ba48810ddee3e03d7cc56352fef3b9e3386 Mon Sep 17 00:00:00 2001 From: rina Date: Sun, 7 Sep 2025 09:49:49 +1000 Subject: [PATCH 17/59] rename --- lychee-lib/src/types/{source_base_info.rs => base_info.rs} | 0 lychee-lib/src/types/mod.rs | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename lychee-lib/src/types/{source_base_info.rs => base_info.rs} (100%) diff --git a/lychee-lib/src/types/source_base_info.rs b/lychee-lib/src/types/base_info.rs similarity index 100% rename from lychee-lib/src/types/source_base_info.rs rename to lychee-lib/src/types/base_info.rs diff --git a/lychee-lib/src/types/mod.rs b/lychee-lib/src/types/mod.rs index dc245dec1b..4091ba793b 100644 --- a/lychee-lib/src/types/mod.rs +++ b/lychee-lib/src/types/mod.rs @@ -2,6 +2,7 @@ mod accept; mod base; +mod base_info; mod basic_auth; mod cache; mod cookies; @@ -12,7 +13,6 @@ pub(crate) mod mail; mod request; pub(crate) mod 
resolver; mod response; -mod source_base_info; mod status; mod status_code; pub(crate) mod uri; @@ -27,7 +27,7 @@ pub use file::{FileExtensions, FileType}; pub use input::{Input, InputContent, InputResolver, InputSource}; pub use request::Request; pub use response::{Response, ResponseBody}; -pub use source_base_info::SourceBaseInfo; +pub use base_info::SourceBaseInfo; pub use status::Status; pub use status_code::*; From 6c3181151a7edd5365e3393f1a88ebf14e2be782 Mon Sep 17 00:00:00 2001 From: rina Date: Sun, 7 Sep 2025 10:22:08 +1000 Subject: [PATCH 18/59] refactor to move more things into SourceBaseInfo. TODO: fix+new tests --- lychee-lib/src/types/base_info.rs | 111 ++++++++++++++++++------------ lychee-lib/src/types/mod.rs | 2 +- lychee-lib/src/utils/request.rs | 37 +++++----- lychee-lib/src/utils/reqwest.rs | 14 ---- lychee-lib/src/utils/url.rs | 83 +++++++++++++--------- 5 files changed, 132 insertions(+), 115 deletions(-) diff --git a/lychee-lib/src/types/base_info.rs b/lychee-lib/src/types/base_info.rs index 3eaf40a877..853b6c8ed6 100644 --- a/lychee-lib/src/types/base_info.rs +++ b/lychee-lib/src/types/base_info.rs @@ -6,29 +6,55 @@ use crate::ErrorKind; use crate::InputSource; use crate::Uri; use crate::types::uri::raw::RawUri; -use crate::utils::reqwest::ReqwestUrlExt; -use crate::utils::url::apply_rooted_base_url; +use crate::utils::url::ReqwestUrlExt; #[derive(Debug, PartialEq, Eq, Clone)] pub struct SourceBaseInfo { - origin: Url, - subpath: String, - allow_absolute: bool, + /// Tuple of `origin`, `subpath`, `allow_absolute` + base: Option<(Url, String, bool)>, remote_local_mappings: Vec<(Url, Url)>, } impl SourceBaseInfo { - fn infer_source_base(url: &Url) -> Option<(Url, String, bool)> { - let origin = url.join("/").ok()?; - let subpath = origin.make_relative(url)?; - Some((origin, subpath, url.scheme() != "file")) + pub fn new( + base: Option<(Url, String, bool)>, + remote_local_mappings: Vec<(Url, Url)>, + ) -> Result { + let conflicting_mapping = 
remote_local_mappings.iter().find(|(remote, local)| { + if remote == local { + false + } else { + remote.strip_prefix(local).is_some() || local.strip_prefix(remote).is_some() + } + }); + + match conflicting_mapping { + Some((base, root)) => Err(ErrorKind::InvalidBase( + base.to_string(), + format!("base is parent or child of {root}"), + )), + None => Ok(Self { + base, + remote_local_mappings, + }), + } + } + + fn infer_default_base(url: &Url) -> Result<(Url, String, bool), ErrorKind> { + let origin = url + .join("/") + .map_err(|e| ErrorKind::ParseUrl(e, url.to_string()))?; + let subpath = origin + .make_relative(url) + .expect("failed make a url relative to its own origin root?!"); + Ok((origin, subpath, url.scheme() != "file")) } pub fn from_source( source: &InputSource, root_dir: Option<&Path>, base: Option<&Base>, - ) -> Result, ErrorKind> { + ) -> Result { let root_dir_url = root_dir .map(|path| Base::Local(path.to_owned()).to_url()) .transpose()?; @@ -41,59 +67,54 @@ impl SourceBaseInfo { let source_url = source.to_url()?; - let Some(source_url) = source_url else { - return Ok(None); - }; - let remote_local_mappings = match (base_url, root_dir_url) { (Some(base_url), Some(root_dir_url)) => vec![(base_url, root_dir_url)], _ => vec![], }; - let (origin, subpath, allow_absolute) = remote_local_mappings + let Some(source_url) = source_url else { + return Self::new(None, remote_local_mappings); + }; + + let base = remote_local_mappings .iter() .find_map(|(remote, local)| { source_url .strip_prefix(local) .map(|subpath| (remote.clone(), subpath, true)) }) - .map_or_else( - || SourceBaseInfo::infer_source_base(&source_url).ok_or(ErrorKind::InvalidUrlHost), - Ok, - )?; - - Ok(Some(Self { - origin, - subpath, - allow_absolute, - remote_local_mappings, - })) + .map_or_else(|| SourceBaseInfo::infer_default_base(&source_url), Ok)?; + + Self::new(Some(base), remote_local_mappings) } pub fn parse_uri(&self, raw_uri: &RawUri) -> Result { let Self { - origin, - subpath, - 
allow_absolute, + base, remote_local_mappings, } = self; - let is_absolute = raw_uri.text.trim_ascii_start().starts_with('/'); - if !allow_absolute && is_absolute { - return Err(ErrorKind::InvalidBaseJoin(raw_uri.text.clone())); - } + let is_absolute = || raw_uri.text.trim_ascii_start().starts_with('/'); - match apply_rooted_base_url(origin, &[subpath, &raw_uri.text]) { - Ok(url) => remote_local_mappings - .iter() - .find_map(|(remote, local)| { - url.strip_prefix(remote) - .and_then(|subpath| local.join(&subpath).ok()) - }) - .map_or(Ok(url), Ok), - Err(e) => Err(e), - } - .map_err(|e| ErrorKind::ParseUrl(e, raw_uri.text.clone())) - .map(|url| Uri { url }) + let Uri { url } = Uri::try_from(raw_uri.clone()).or_else(|e| match base { + Some((_, _, _allow_absolute @ false)) if is_absolute() => { + Err(ErrorKind::InvalidBaseJoin(raw_uri.text.clone())) + } + Some((origin, subpath, _)) => origin + .join_rooted(&[subpath, &raw_uri.text]) + .map_err(|e| ErrorKind::ParseUrl(e, raw_uri.text.clone())) + .map(|url| Uri { url }), + None => Err(e), + })?; + + let url = remote_local_mappings + .iter() + .find_map(|(remote, local)| { + url.strip_prefix(remote) + .and_then(|subpath| local.join(&subpath).ok()) + }) + .unwrap_or(url); + + Ok(Uri { url }) } } diff --git a/lychee-lib/src/types/mod.rs b/lychee-lib/src/types/mod.rs index 4091ba793b..f75a5f1f9c 100644 --- a/lychee-lib/src/types/mod.rs +++ b/lychee-lib/src/types/mod.rs @@ -19,6 +19,7 @@ pub(crate) mod uri; pub use accept::*; pub use base::Base; +pub use base_info::SourceBaseInfo; pub use basic_auth::{BasicAuthCredentials, BasicAuthSelector}; pub use cache::CacheStatus; pub use cookies::CookieJar; @@ -27,7 +28,6 @@ pub use file::{FileExtensions, FileType}; pub use input::{Input, InputContent, InputResolver, InputSource}; pub use request::Request; pub use response::{Response, ResponseBody}; -pub use base_info::SourceBaseInfo; pub use status::Status; pub use status_code::*; diff --git a/lychee-lib/src/utils/request.rs 
b/lychee-lib/src/utils/request.rs index 07456aac69..0b06fbfab3 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -12,7 +12,7 @@ use crate::{ Base, BasicAuthCredentials, ErrorKind, Request, Result, Uri, basic_auth::BasicAuthExtractor, types::{InputSource, uri::raw::RawUri}, - utils::{path, reqwest::ReqwestUrlExt, url}, + utils::{path, url, url::ReqwestUrlExt}, }; use ::url::ParseError; @@ -28,13 +28,10 @@ pub(crate) fn extract_credentials( fn create_request( raw_uri: &RawUri, source: &InputSource, - base_info: Option<&SourceBaseInfo>, + base_info: &SourceBaseInfo, extractor: Option<&BasicAuthExtractor>, ) -> Result { - let uri = Uri::try_from(raw_uri.clone()).or_else(|e| match base_info { - Some(base_info) => base_info.parse_uri(raw_uri), - None => Err(e), // TODO: more precise error kind? - })?; + let uri = base_info.parse_uri(raw_uri)?; let source = truncate_source(source); let element = raw_uri.element.clone(); let attribute = raw_uri.attribute.clone(); @@ -121,17 +118,15 @@ fn try_parse_into_uri( Some((_, _, false)) if raw_uri.text.trim_ascii_start().starts_with('/') => { Err(ParseError::RelativeUrlWithoutBase) } - Some((base, subpath, _allow_absolute)) => { - url::apply_rooted_base_url(&base, &[&subpath, &raw_uri.text]).and_then(|url| { - match (base_url.as_deref(), &root_dir_url) { - (Some(base_url), Some(root_dir_url)) => url - .strip_prefix(base_url) - .and_then(|subpath| root_dir_url.join(&subpath).ok()) - .map_or(Ok(url), Ok), - _ => Ok(url), - } - }) - } + Some((base, subpath, _allow_absolute)) => base + .join_rooted(&[&subpath, &raw_uri.text]) + .and_then(|url| match (base_url.as_deref(), &root_dir_url) { + (Some(base_url), Some(root_dir_url)) => url + .strip_prefix(base_url) + .and_then(|subpath| root_dir_url.join(&subpath).ok()) + .map_or(Ok(url), Ok), + _ => Ok(url), + }), None => Url::parse(&raw_uri.text), } .inspect(|x| println!("OUT -----> {x}")) @@ -231,15 +226,15 @@ pub(crate) fn create( }; uris.into_iter() - 
.filter_map(|raw_uri| { - match create_request(&raw_uri, source, base_info.as_ref(), extractor) { + .filter_map( + |raw_uri| match create_request(&raw_uri, source, &base_info, extractor) { Ok(request) => Some(request), Err(e) => { warn!("Error creating request: {e:?}"); None } - } - }) + }, + ) .collect() } diff --git a/lychee-lib/src/utils/reqwest.rs b/lychee-lib/src/utils/reqwest.rs index 23d9bdfeed..8c54203e52 100644 --- a/lychee-lib/src/utils/reqwest.rs +++ b/lychee-lib/src/utils/reqwest.rs @@ -1,19 +1,5 @@ use std::error::Error; -pub(crate) trait ReqwestUrlExt { - fn strip_prefix(&self, prefix: &reqwest::Url) -> Option; -} - -impl ReqwestUrlExt for reqwest::Url { - fn strip_prefix(&self, prefix: &reqwest::Url) -> Option { - prefix - .make_relative(self) - .filter(|subpath| !subpath.starts_with("../")) - // .inspect(|x| println!("subpathing {}", x)) - // .filter(|_| prefix.as_str().starts_with(self.as_str())) - } -} - /// A rule for matching error message patterns to human-readable messages struct ErrorRule { patterns: &'static [&'static str], diff --git a/lychee-lib/src/utils/url.rs b/lychee-lib/src/utils/url.rs index 26f42f99a9..7499252252 100644 --- a/lychee-lib/src/utils/url.rs +++ b/lychee-lib/src/utils/url.rs @@ -3,6 +3,7 @@ use std::sync::LazyLock; use linkify::LinkFinder; use reqwest::Url; +use url::ParseError; static LINK_FINDER: LazyLock = LazyLock::new(LinkFinder::new); @@ -20,44 +21,58 @@ pub(crate) fn remove_get_params_and_separate_fragment(url: &str) -> (&str, Optio (path, frag) } -pub(crate) fn apply_rooted_base_url( - base: &Url, - subpaths: &[&str], -) -> std::result::Result { - // println!("applying {}, {}, {}", base, subpath, link); - // tests: - // - .. out of local base should be blocked. - // - scheme-relative urls should work and not spuriously trigger base url - // - fully-qualified urls should work - // - slash should work to go to local base, if specified - // - slash should be forbidden for inferred base urls. 
- // - percent encoding ;-; - // - trailing slashes in base-url and/or root-dir - // - fragments and query params, on both http and file - let fake_base = match base.scheme() { - "file" => { - let mut fake_base = base.join("/")?; - fake_base.set_host(Some("secret-lychee-base-url.invalid"))?; - Some(fake_base) - } - _ => None, - }; +// Use `LinkFinder` to offload the raw link searching in plaintext +pub(crate) fn find_links(input: &str) -> impl Iterator> { + LINK_FINDER.links(input) +} - let mut url = Cow::Borrowed(fake_base.as_ref().unwrap_or(base)); - for subpath in subpaths { - url = Cow::Owned(url.join(subpath)?); - } +pub(crate) trait ReqwestUrlExt { + fn strip_prefix(&self, prefix: &reqwest::Url) -> Option; + fn join_rooted(&self, subpaths: &[&str]) -> Result; +} - match fake_base.as_ref().and_then(|b| b.make_relative(&url)) { - Some(relative_to_base) => base.join(&relative_to_base), - None => Ok(url.into_owned()), +impl ReqwestUrlExt for reqwest::Url { + fn strip_prefix(&self, prefix: &reqwest::Url) -> Option { + prefix + .make_relative(self) + .filter(|subpath| !subpath.starts_with("../")) + // .inspect(|x| println!("subpathing {}", x)) + // .filter(|_| prefix.as_str().starts_with(self.as_str())) } - .inspect(|x| println!("---> {x}")) -} -// Use `LinkFinder` to offload the raw link searching in plaintext -pub(crate) fn find_links(input: &str) -> impl Iterator> { - LINK_FINDER.links(input) + fn join_rooted(&self, subpaths: &[&str]) -> Result { + let base = self; + // println!("applying {}, {}, {}", base, subpath, link); + // tests: + // - .. out of local base should be blocked. + // - scheme-relative urls should work and not spuriously trigger base url + // - fully-qualified urls should work + // - slash should work to go to local base, if specified + // - slash should be forbidden for inferred base urls. 
+ // - percent encoding ;-; + // - trailing slashes in base-url and/or root-dir + // - fragments and query params, on both http and file + // - windows file paths ;-; + let fake_base = match base.scheme() { + "file" => { + let mut fake_base = base.join("/")?; + fake_base.set_host(Some("secret-lychee-base-url.invalid"))?; + Some(fake_base) + } + _ => None, + }; + + let mut url = Cow::Borrowed(fake_base.as_ref().unwrap_or(base)); + for subpath in subpaths { + url = Cow::Owned(url.join(subpath)?); + } + + match fake_base.as_ref().and_then(|b| b.make_relative(&url)) { + Some(relative_to_base) => base.join(&relative_to_base), + None => Ok(url.into_owned()), + } + .inspect(|x| println!("---> {x}")) + } } #[cfg(test)] From a1249ae0a21b4f59138312712e060c97e48bd6e6 Mon Sep 17 00:00:00 2001 From: rina Date: Sun, 7 Sep 2025 10:37:31 +1000 Subject: [PATCH 19/59] touch --- lychee-lib/src/types/base_info.rs | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/lychee-lib/src/types/base_info.rs b/lychee-lib/src/types/base_info.rs index 853b6c8ed6..0ef3b88ead 100644 --- a/lychee-lib/src/types/base_info.rs +++ b/lychee-lib/src/types/base_info.rs @@ -89,14 +89,9 @@ impl SourceBaseInfo { } pub fn parse_uri(&self, raw_uri: &RawUri) -> Result { - let Self { - base, - remote_local_mappings, - } = self; - let is_absolute = || raw_uri.text.trim_ascii_start().starts_with('/'); - let Uri { url } = Uri::try_from(raw_uri.clone()).or_else(|e| match base { + let Uri { url } = Uri::try_from(raw_uri.clone()).or_else(|e| match &self.base { Some((_, _, _allow_absolute @ false)) if is_absolute() => { Err(ErrorKind::InvalidBaseJoin(raw_uri.text.clone())) } @@ -107,7 +102,8 @@ impl SourceBaseInfo { None => Err(e), })?; - let url = remote_local_mappings + let url = self + .remote_local_mappings .iter() .find_map(|(remote, local)| { url.strip_prefix(remote) From 24e5eb7b2a2147bdcada7dc270ded5aa2336efd7 Mon Sep 17 00:00:00 2001 From: rina Date: Sun, 7 Sep 2025 10:51:38 +1000 
Subject: [PATCH 20/59] fix existing tests --- lychee-lib/src/utils/request.rs | 121 +++----------------------------- 1 file changed, 9 insertions(+), 112 deletions(-) diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 0b06fbfab3..9250d4bd80 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -40,114 +40,16 @@ fn create_request( Ok(Request::new(uri, source, element, attribute, credentials)) } -/// Try to parse the raw URI into a `Uri`. -/// -/// If the raw URI is not a valid URI, create a URI by joining the base URL with the text. -/// If the base URL is not available, create a URI from the file path. -/// -/// # Errors -/// -/// - If the text (the unparsed URI represented as a `String`) cannot be joined with the base -/// to create a valid URI. -/// - If a URI cannot be created from the file path. -/// - If the source is not a file path (i.e. the URI type is not supported). +/// Shim to [`SourceBaseInfo`] for testing. This function is no longer +/// used by the main execution. fn try_parse_into_uri( raw_uri: &RawUri, source: &InputSource, root_dir: Option<&Path>, base: Option<&Base>, ) -> Result { - let text = prepend_root_dir_if_absolute_local_link(&raw_uri.text, root_dir); - - let root_dir_url = root_dir - .map(|path| Base::Local(path.to_owned()).to_url()) - .transpose()?; - - // println!("{:?}", base.clone()); - let base_url: Option> = base - .map(Base::to_url) - .transpose()? - .map(Cow::Owned) - .or(root_dir_url.as_ref().map(Cow::Borrowed)); - // println!("{:?}", apply_base(&raw_uri.text, base.as_ref())); - - // println!("{:?}", base.clone().unwrap().join("not rooted")); - // println!("{:?}", base.clone().unwrap().join("/rooted")); - - // 1. graft input source - if source is a FsPath subdirectory of root_dir, - // replace InputSource with RemoteUrl. - - // 2. map root_dir to base. 
- - // let root_dir_url = root_dir - // .map(Path::to_string_lossy) - // .map(|x| Url::from_file_path(&*x).expect("file path to url failed?!")); - // let root_dir = root_dir_url.as_ref().map(Url::as_str); - // - - let infer_source_base = |url: &Url| -> Option<_> { - let top = url.join("/").ok()?; - let subpath = top.make_relative(url)?; - Some((Cow::Owned(top), Cow::Owned(subpath), url.scheme() != "file")) - }; - - let source_url = source.to_url()?; - - let base_info = match &source_url { - Some(source_url) => match (base_url.as_deref(), &root_dir_url) { - (Some(base_url), Some(root_dir_url)) => source_url - .strip_prefix(root_dir_url) - .map(|subpath| (Cow::Borrowed(base_url), Cow::Owned(subpath), true)), - _ => None, - } - .map_or_else( - || infer_source_base(source_url).ok_or(ErrorKind::InvalidUrlHost), - Ok, - )? - .into(), - None => None, - }; - println!("{} {:?}", &raw_uri.text, &base_info); - - // match Uri::try_from(raw_uri.clone()) { - // Ok(uri) => return Ok(uri), - // _ => (), - // }; - - match base_info { - Some((_, _, false)) if raw_uri.text.trim_ascii_start().starts_with('/') => { - Err(ParseError::RelativeUrlWithoutBase) - } - Some((base, subpath, _allow_absolute)) => base - .join_rooted(&[&subpath, &raw_uri.text]) - .and_then(|url| match (base_url.as_deref(), &root_dir_url) { - (Some(base_url), Some(root_dir_url)) => url - .strip_prefix(base_url) - .and_then(|subpath| root_dir_url.join(&subpath).ok()) - .map_or(Ok(url), Ok), - _ => Ok(url), - }), - None => Url::parse(&raw_uri.text), - } - .inspect(|x| println!("OUT -----> {x}")) - .map_err(|e| ErrorKind::ParseUrl(e, raw_uri.text.clone())) - .map(|url| Uri { url }) - - // TODO: MAP BACK TO local root dir by checking if ads starts with base. 
- - // let uri = match Uri::try_from(raw_uri.clone()) { - // Ok(uri) => uri, - // Err(_) => match base { - // Some(base_url) => match base_url.join(&text) { - // Some(url) => Uri { url }, - // None => return Err(ErrorKind::InvalidBaseJoin(text.clone())), - // }, - // None => panic!("no base :((((("), - // }, - // }; - // println!(" = {uri:?}"); - - // let base = base.and_then(Base::to_url); + let base_info = SourceBaseInfo::from_source(source, root_dir, base)?; + base_info.parse_uri(raw_uri) } // Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs @@ -568,15 +470,10 @@ mod tests { fn test_create_request_from_relative_file_path() { let base = Base::Local(PathBuf::from("/tmp/lychee")); let input_source = InputSource::FsPath(PathBuf::from("page.html")); + let base_info = SourceBaseInfo::from_source(&input_source, None, Some(&base)).unwrap(); - let actual = create_request( - &RawUri::from("file.html"), - &input_source, - None, - Some(&base), - None, - ) - .unwrap(); + let actual = + create_request(&RawUri::from("file.html"), &input_source, &base_info, None).unwrap(); assert_eq!( actual, @@ -596,13 +493,13 @@ mod tests { fn test_create_request_from_absolute_file_path() { let base = Base::Local(PathBuf::from("/tmp/lychee")); let input_source = InputSource::FsPath(PathBuf::from("/tmp/lychee/page.html")); + let base_info = SourceBaseInfo::from_source(&input_source, None, Some(&base)).unwrap(); // Use an absolute path that's outside the base directory let actual = create_request( &RawUri::from("/usr/local/share/doc/example.html"), &input_source, - None, - Some(&base), + &base_info, None, ) .unwrap(); From 7ac50fc28d54543ce9b2c3131e35233b359095f8 Mon Sep 17 00:00:00 2001 From: rina Date: Sun, 7 Sep 2025 10:54:52 +1000 Subject: [PATCH 21/59] no print --- lychee-lib/src/collector.rs | 1 - lychee-lib/src/types/base_info.rs | 2 +- lychee-lib/src/utils/url.rs | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git 
a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 1b79937e11..2b3c35ae0b 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -289,7 +289,6 @@ impl Collector { async move { let content = content?; let uris: Vec = extractor.extract(&content); - println!("{:?}", &uris); let requests = request::create( uris, &content.source, diff --git a/lychee-lib/src/types/base_info.rs b/lychee-lib/src/types/base_info.rs index 0ef3b88ead..4b9ba2b816 100644 --- a/lychee-lib/src/types/base_info.rs +++ b/lychee-lib/src/types/base_info.rs @@ -59,7 +59,7 @@ impl SourceBaseInfo { .map(|path| Base::Local(path.to_owned()).to_url()) .transpose()?; - println!("{:?}", base.clone()); + // println!("{:?}", base.clone()); let base_url: Option = base .map(Base::to_url) .transpose()? diff --git a/lychee-lib/src/utils/url.rs b/lychee-lib/src/utils/url.rs index 7499252252..b453a7df0d 100644 --- a/lychee-lib/src/utils/url.rs +++ b/lychee-lib/src/utils/url.rs @@ -71,7 +71,7 @@ impl ReqwestUrlExt for reqwest::Url { Some(relative_to_base) => base.join(&relative_to_base), None => Ok(url.into_owned()), } - .inspect(|x| println!("---> {x}")) + // .inspect(|x| println!("---> {x}")) } } From 9d1cd42a60c0d7621b5ff82185161ab0bdaa4542 Mon Sep 17 00:00:00 2001 From: rina Date: Sun, 7 Sep 2025 11:21:44 +1000 Subject: [PATCH 22/59] tweak some tests --- lychee-bin/tests/cli.rs | 6 +++--- lychee-lib/src/types/base_info.rs | 20 ++++++++++++++++++-- lychee-lib/src/utils/request.rs | 4 ++-- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 6ff06d1895..a4221e3e4b 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -425,7 +425,7 @@ mod cli { let dir = fixtures_path().join("resolve_paths"); cmd.arg("--offline") - .arg("--base-url") + .arg("--root-dir") .arg(&dir) .arg(dir.join("index.html")) .env_clear() @@ -460,9 +460,9 @@ mod cli { cmd.arg("--offline") .arg("--root-dir") - .arg("/resolve_paths") 
+ .arg(dir.join("resolve_paths")) .arg("--base-url") - .arg(&dir) + .arg(dir.join("resolve_paths")) .arg(dir.join("resolve_paths").join("index.html")) .env_clear() .assert() diff --git a/lychee-lib/src/types/base_info.rs b/lychee-lib/src/types/base_info.rs index 4b9ba2b816..9906a5d0c1 100644 --- a/lychee-lib/src/types/base_info.rs +++ b/lychee-lib/src/types/base_info.rs @@ -7,6 +7,7 @@ use crate::InputSource; use crate::Uri; use crate::types::uri::raw::RawUri; use crate::utils::url::ReqwestUrlExt; +use url::PathSegmentsMut; #[derive(Debug, PartialEq, Eq, Clone)] pub struct SourceBaseInfo { @@ -31,7 +32,7 @@ impl SourceBaseInfo { match conflicting_mapping { Some((base, root)) => Err(ErrorKind::InvalidBase( base.to_string(), - format!("base is parent or child of {root}"), + format!("base cannot be parent or child of root-dir {root}"), )), None => Ok(Self { base, @@ -67,6 +68,16 @@ impl SourceBaseInfo { let source_url = source.to_url()?; + // BACKWARDS COMPAT: if /only/ base-url is given, then apply it + // indiscriminately to all inputs, regardless of source, and apply no mappings. 
+ match (&base_url, &root_dir_url) { + (Some(base_url), None) => { + let (origin, subpath, _) = Self::infer_default_base(base_url)?; + return Self::new(Some((origin, subpath, true)), vec![]); + } + _ => () + } + let remote_local_mappings = match (base_url, root_dir_url) { (Some(base_url), Some(root_dir_url)) => vec![(base_url, root_dir_url)], _ => vec![], @@ -102,7 +113,7 @@ impl SourceBaseInfo { None => Err(e), })?; - let url = self + let mut url = self .remote_local_mappings .iter() .find_map(|(remote, local)| { @@ -111,6 +122,11 @@ impl SourceBaseInfo { }) .unwrap_or(url); + // BACKWARDS COMPAT: delete trailing slash for file urls + if url.scheme() == "file" { + let _ = url.path_segments_mut().as_mut().map(PathSegmentsMut::pop_if_empty); + } + Ok(Uri { url }) } } diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 9250d4bd80..683aeda111 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -436,7 +436,7 @@ mod tests { #[test] fn test_fragment_url_resolution_from_root_dir_and_base_url() { - let root_dir = PathBuf::from("/tmp/lychee"); + let root_dir = PathBuf::from("/some"); let base = Base::try_from("https://example.com/path/page.html").unwrap(); let source = InputSource::FsPath(PathBuf::from("/some/page.html")); @@ -447,7 +447,7 @@ mod tests { assert!( requests .iter() - .any(|r| r.uri.url.as_str() == "https://example.com/path/page.html#fragment") + .any(|r| r.uri.url.as_str() == "/some/page.html#fragment") ); } From 718f462bd6e9723dbd7cf097a3518096d2202f81 Mon Sep 17 00:00:00 2001 From: rina Date: Sun, 7 Sep 2025 11:38:32 +1000 Subject: [PATCH 23/59] blah. honestly idk what was happening with these tests in the past... 
--- lychee-lib/src/types/base_info.rs | 47 +++++++++++++++++++++++++++++-- lychee-lib/src/utils/request.rs | 6 ++-- 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/lychee-lib/src/types/base_info.rs b/lychee-lib/src/types/base_info.rs index 9906a5d0c1..99d51359ad 100644 --- a/lychee-lib/src/types/base_info.rs +++ b/lychee-lib/src/types/base_info.rs @@ -75,7 +75,7 @@ impl SourceBaseInfo { let (origin, subpath, _) = Self::infer_default_base(base_url)?; return Self::new(Some((origin, subpath, true)), vec![]); } - _ => () + _ => (), } let remote_local_mappings = match (base_url, root_dir_url) { @@ -113,6 +113,8 @@ impl SourceBaseInfo { None => Err(e), })?; + // println!("before mappings: {}", url.as_str()); + let mut url = self .remote_local_mappings .iter() @@ -124,9 +126,50 @@ impl SourceBaseInfo { // BACKWARDS COMPAT: delete trailing slash for file urls if url.scheme() == "file" { - let _ = url.path_segments_mut().as_mut().map(PathSegmentsMut::pop_if_empty); + let _ = url + .path_segments_mut() + .as_mut() + .map(PathSegmentsMut::pop_if_empty); } Ok(Uri { url }) } } + +#[cfg(test)] +mod tests { + use super::*; + use std::path::PathBuf; + + #[test] + fn test_base_with_filename() { + let root_dir = PathBuf::from("/some"); + let base = Base::try_from("https://example.com/path/page2.html").unwrap(); + let source = InputSource::FsPath(PathBuf::from("/some/page.html")); + let base_info = SourceBaseInfo::from_source(&source, Some(&root_dir), Some(&base)).unwrap(); + + assert_eq!( + base_info + .parse_uri(&RawUri::from("#fragment")) + .as_ref() + .map(|x| x.url.as_str()), + Ok("file:///some/page.html#fragment") + ); + } + + #[test] + fn test_base_with_same_filename() { + let root_dir = PathBuf::from("/some/pagex.html"); + let base = Base::try_from("https://example.com/path/page.html").unwrap(); + let source = InputSource::FsPath(PathBuf::from("/some/pagex.html")); + let base_info = SourceBaseInfo::from_source(&source, Some(&root_dir), Some(&base)).unwrap(); + 
+ assert_eq!( + base_info + .parse_uri(&RawUri::from("#fragment")) + .as_ref() + .map(|x| x.url.as_str()), + Ok("file:///some/pagex.html#fragment") + ); + } +} diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 683aeda111..c99e5190d2 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -430,14 +430,14 @@ mod tests { assert!( requests .iter() - .any(|r| r.uri.url.as_str() == "https://example.com/parent") + .any(|r| r.uri.url.as_str() == "file:///parent") ); } #[test] fn test_fragment_url_resolution_from_root_dir_and_base_url() { let root_dir = PathBuf::from("/some"); - let base = Base::try_from("https://example.com/path/page.html").unwrap(); + let base = Base::try_from("https://example.com/path/").unwrap(); let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("#fragment")]; @@ -447,7 +447,7 @@ mod tests { assert!( requests .iter() - .any(|r| r.uri.url.as_str() == "/some/page.html#fragment") + .any(|r| r.uri.url.as_str() == "file:///some/page.html#fragment") ); } From db70242b1555abb469f62bd00a6b013ec154ce33 Mon Sep 17 00:00:00 2001 From: rina Date: Sun, 7 Sep 2025 11:47:14 +1000 Subject: [PATCH 24/59] lychee-lib tests PASS --- lychee-lib/src/utils/request.rs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index c99e5190d2..4d5a2da6a3 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -321,7 +321,7 @@ mod tests { #[test] fn test_root_relative_url_resolution_from_root_dir() { let root_dir = PathBuf::from("/tmp/lychee"); - let source = InputSource::FsPath(PathBuf::from("/some/page.html")); + let source = InputSource::FsPath(PathBuf::from("/tmp/lychee/page.html")); let uris = vec![RawUri::from("/root-relative")]; let requests = create(uris, &source, Some(&root_dir), None, None); @@ -370,7 +370,7 @@ mod tests { fn 
test_relative_url_resolution_from_root_dir_and_base_url() { let root_dir = PathBuf::from("/tmp/lychee"); let base = Base::try_from("https://example.com/path/page.html").unwrap(); - let source = InputSource::FsPath(PathBuf::from("/some/page.html")); + let source = InputSource::FsPath(PathBuf::from("/tmp/lychee/localpage.html")); let uris = vec![RawUri::from("relative.html")]; let requests = create(uris, &source, Some(&root_dir), Some(&base), None); @@ -379,7 +379,7 @@ mod tests { assert!( requests .iter() - .any(|r| r.uri.url.as_str() == "https://example.com/path/relative.html") + .any(|r| r.uri.url.as_str() == "file:///tmp/lychee/relative.html") ); } @@ -392,6 +392,7 @@ mod tests { let uris = vec![RawUri::from("https://another.com/page")]; let requests = create(uris, &source, Some(&root_dir), Some(&base), None); + println!("{:?}", requests); assert_eq!(requests.len(), 1); assert!( requests @@ -404,16 +405,17 @@ mod tests { fn test_root_relative_url_resolution_from_root_dir_and_base_url() { let root_dir = PathBuf::from("/tmp/lychee"); let base = Base::try_from("https://example.com/path/page.html").unwrap(); - let source = InputSource::FsPath(PathBuf::from("/some/page.html")); + let source = InputSource::FsPath(PathBuf::from("/tmp/lychee/localpage.html")); let uris = vec![RawUri::from("/root-relative")]; let requests = create(uris, &source, Some(&root_dir), Some(&base), None); + println!("{:?}", requests); assert_eq!(requests.len(), 1); assert!( requests .iter() - .any(|r| r.uri.url.as_str() == "https://example.com/tmp/lychee/root-relative") + .any(|r| r.uri.url.as_str() == "https://example.com/root-relative") ); } From 9c6436899690c84cc1dc757870f648986e7bc96d Mon Sep 17 00:00:00 2001 From: rina Date: Sun, 7 Sep 2025 12:00:57 +1000 Subject: [PATCH 25/59] strip_prefix tests and fix --- lychee-lib/src/utils/url.rs | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/lychee-lib/src/utils/url.rs b/lychee-lib/src/utils/url.rs 
index b453a7df0d..48e5f7b452 100644 --- a/lychee-lib/src/utils/url.rs +++ b/lychee-lib/src/utils/url.rs @@ -27,15 +27,15 @@ pub(crate) fn find_links(input: &str) -> impl Iterator> } pub(crate) trait ReqwestUrlExt { - fn strip_prefix(&self, prefix: &reqwest::Url) -> Option; + fn strip_prefix(&self, prefix: &Url) -> Option; fn join_rooted(&self, subpaths: &[&str]) -> Result; } -impl ReqwestUrlExt for reqwest::Url { - fn strip_prefix(&self, prefix: &reqwest::Url) -> Option { +impl ReqwestUrlExt for Url { + fn strip_prefix(&self, prefix: &Url) -> Option { prefix .make_relative(self) - .filter(|subpath| !subpath.starts_with("../")) + .filter(|subpath| !subpath.starts_with("../") && !subpath.starts_with("/")) // .inspect(|x| println!("subpathing {}", x)) // .filter(|_| prefix.as_str().starts_with(self.as_str())) } @@ -75,6 +75,29 @@ impl ReqwestUrlExt for reqwest::Url { } } +#[cfg(test)] +mod test_url_ext { + use super::*; + + #[test] + fn test_strip_prefix() { + // note trailing slashes for subpaths, otherwise everything becomes siblings + let goog = Url::parse("https://goog.com").unwrap(); + let goog_subpath = goog.join("subpath/").unwrap(); + let goog_subsubpath = goog_subpath.join("sub2path/").unwrap(); + + assert_eq!(goog.strip_prefix(&goog).as_deref(), Some("")); + + assert_eq!( + goog_subpath.strip_prefix(&goog).as_deref(), + Some("subpath/") + ); + assert_eq!(goog.strip_prefix(&goog_subpath).as_deref(), None); + + assert_eq!(goog_subpath.strip_prefix(&goog_subsubpath).as_deref(), None); + } +} + #[cfg(test)] mod test_fs_tree { use super::*; From b1bc0a3c83e90cfc36f1c9f6b1176727854f570f Mon Sep 17 00:00:00 2001 From: katrinafyi <39479354+katrinafyi@users.noreply.github.com> Date: Sun, 7 Sep 2025 14:39:07 +1000 Subject: [PATCH 26/59] Update ci.yml --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cd62adb9c5..97b4e5c5e9 100644 --- a/.github/workflows/ci.yml +++ 
b/.github/workflows/ci.yml @@ -15,7 +15,7 @@ on: env: CARGO_TERM_COLOR: always CARGO_INCREMENTAL: 0 - RUSTFLAGS: -D warnings + #RUSTFLAGS: -D warnings jobs: test: From 0eb1b51e21688d7523d8ff549532835f3c5dec6c Mon Sep 17 00:00:00 2001 From: katrinafyi <39479354+katrinafyi@users.noreply.github.com> Date: Sun, 7 Sep 2025 14:43:08 +1000 Subject: [PATCH 27/59] Update ci.yml --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 97b4e5c5e9..97f3f3f367 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -47,7 +47,7 @@ jobs: uses: actions-rs/cargo@v1 with: command: clippy - args: --all-targets --all-features -- -D warnings + args: --all-targets --all-features -- #-D warnings - uses: cargo-bins/cargo-binstall@main - name: Install cargo-msrv run: cargo binstall --no-confirm --force cargo-msrv From 23d2af7e971a44631532117a594d435d6766444f Mon Sep 17 00:00:00 2001 From: rina Date: Sun, 7 Sep 2025 20:52:14 +1000 Subject: [PATCH 28/59] update help text --- README.md | 87 ++++++++++++++++++++++--------- lychee-bin/src/options.rs | 104 +++++++++++++++++++++++++++----------- 2 files changed, 136 insertions(+), 55 deletions(-) diff --git a/README.md b/README.md index c7f1270859..c5c379f498 100644 --- a/README.md +++ b/README.md @@ -546,36 +546,73 @@ Options: Deprecated; use `--base-url` instead -b, --base-url - Base URL to use when resolving relative URLs in local files. If specified, - relative links in local files are interpreted as being relative to the given - base URL. + Remote base URL where the local root-dir will be hosted. If `--base-url` is + specified, `--root-dir` must be specified as well. - For example, given a base URL of `https://example.com/dir/page`, the link `a` - would resolve to `https://example.com/dir/a` and the link `/b` would resolve - to `https://example.com/b`. 
This behavior is not affected by the filesystem - path of the file containing these links. + When both `--base-url` and `--root-dir` are specified, then links will be resolved + *as if* the local root-dir was hosted at the given base-url. - Note that relative URLs without a leading slash become siblings of the base - URL. If, instead, the base URL ended in a slash, the link would become a child - of the base URL. For example, a base URL of `https://example.com/dir/page/` and - a link of `a` would resolve to `https://example.com/dir/page/a`. + This is done by virtually "splicing" the root-dir onto the base-url path. This + works in both directions: (1) links to subpaths of base-url will be resolved to + local files within root-dir, with consideration to the relative subpath, and + (2) links originating from local files which traverse outside of base-url will + resolve to remote URLs on the internet. - Basically, the base URL option resolves links as if the local files were hosted - at the given base URL address. + The two directions are demonstrated in the examples below. For these examples, + suppose a base URL of `https://example.com/dir/` and root dir of `/tmp/root`. + + - (1) A link to `https://example.com/dir/sub/boop.html` will be resolved to + the local file `/tmp/root/sub/boop.html` because it is a subpath of base-url. + The relative subpath of `/sub/boop.html` is mapped into the root-dir. + + - (2) A link in `/tmp/root/index.html` to `../up.html` or `/up.html` will be + resolved to the remote URL `https://example.com/up.html` because it traverses + outside of base-url. --root-dir - Root directory to use when checking absolute links in local files. This option is - required if absolute links appear in local files, otherwise those links will be - flagged as errors. This must be an absolute path (i.e., one beginning with `/`). - - If specified, absolute links in local files are resolved by prefixing the given - root directory to the requested absolute link. 
For example, with a root-dir of - `/root/dir`, a link to `/page.html` would be resolved to `/root/dir/page.html`. - - This option can be specified alongside `--base-url`. If both are given, an - absolute link is resolved by constructing a URL from three parts: the domain - name specified in `--base-url`, followed by the `--root-dir` directory path, - followed by the absolute link's own path. + Root directory to use when checking local files. This option is required if + absolute links appear in local files, otherwise those links will be flagged as + errors. This must be an absolute path (i.e., one beginning with `/`). + + If specified, `--root-dir` acts according to three main rules: + + - Links are resolved *as if* the given root-dir was hosted at the root of a + website. For example, with a root-dir of `/tmp`, a link in `/tmp/a/index.html` + to `/page.html` would be resolved to `/tmp/page.html`. + + - `--root-dir` only applies to links originating from files which are subpaths + of the given root directory. Other links will be unaffected (e.g., absolute + links from files outside of root-dir will still fail to be found). + + - `--root-dir` also serves to limit parent path traversal. With a root-dir of + `/tmp`, a link in `/tmp/index.html` to `../up.html` would be resolved to + `/tmp/up.html` and not `/up.html`. This is because if `/tmp` was uploaded to + a website root, traversing up beyond the root would not change the path. + + Additionally, this option can be specified alongside `--base-url`. If both are + given, the behavior is augmented to resolve links as if `--root-dir` was + available at the remote URL of `--base-url`. See the help of `--base-url` for + more information. + + --fallback-base-url + Fallback base URL used for inputs where no more suitable base URL applies. + Each input source may have an associated base URL which describes where that + input was located, for the purpose of resolving relative links.
Where Lychee + cannot determine a *well-founded* base URL for an input source, this fallback + base URL will be used. + + A *well-founded* base URL is one which: + - originates from a remote URL, in which case the base URL is just the remote URL, or + - originates from a local file where `--root-dir` has been specified and the local + file path is a subpath of `--root-dir`. + + In all other cases, the base URL is not well-founded and this fallback base URL + applies. In particular, this includes all links passed by stdin and, if `--root-dir` + is unspecified, this includes all links within local files. + + Note that this fallback base URL applies without consideration to local file paths. + For local files, it is usually better to specify `--base-url` and `--root-dir` + which will construct a base URL while considering subpaths of `--root-dir`. --basic-auth Basic authentication support. E.g. `http://example.com username:password` diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 0ac5cae345..cdb3191792 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -1,7 +1,7 @@ use crate::parse::parse_base; use crate::verbosity::Verbosity; use anyhow::{Context, Error, Result, anyhow}; -use clap::builder::PossibleValuesParser; +use clap::builder::{ArgPredicate, PossibleValuesParser}; use clap::{Parser, arg, builder::TypedValueParser}; use const_format::{concatcp, formatcp}; use http::{ @@ -662,28 +662,35 @@ separated list of accepted status codes. This example will accept 200, 201, #[serde(skip)] pub(crate) base: Option, - /// Base URL used to resolve relative URLs in local files. + /// Remote base URL where the local root-dir will be uploaded. /// Example: #[arg( short, long, value_parser = parse_base, - long_help = "Base URL to use when resolving relative URLs in local files. If specified, -relative links in local files are interpreted as being relative to the given -base URL. 
- -For example, given a base URL of `https://example.com/dir/page`, the link `a` -would resolve to `https://example.com/dir/a` and the link `/b` would resolve -to `https://example.com/b`. This behavior is not affected by the filesystem -path of the file containing these links. - -Note that relative URLs without a leading slash become siblings of the base -URL. If, instead, the base URL ended in a slash, the link would become a child -of the base URL. For example, a base URL of `https://example.com/dir/page/` and -a link of `a` would resolve to `https://example.com/dir/page/a`. - -Basically, the base URL option resolves links as if the local files were hosted -at the given base URL address." + requires_if(ArgPredicate::IsPresent, "root_dir"), + long_help = "Remote base URL where the local root-dir will be hosted. If `--base-url` is +specified, `--root-dir` must be specified as well. + +When both `--base-url` and `--root-dir` are specified, then links will be resolved +*as if* the local root-dir was hosted at the given base-url. + +This is done by virtually \"splicing\" the root-dir onto the base-url path. This +works in both directions: (1) links to subpaths of base-url will be resolved to +local files within root-dir, with consideration to the relative subpath, and +(2) links originating from local files which traverse outside of base-url will +resolve to remote URLs on the internet. + +The two directions are demonstrated in the examples below. For these examples, +suppose a base URL of `https://example.com/dir/` and root dir of `/tmp/root`. + +- (1) A link to `https://example.com/dir/sub/boop.html` will be resolved to + the local file `/tmp/root/sub/boop.html` because it is a subpath of base-url. + The relative subpath of `/sub/boop.html` is mapped into the root-dir. + +- (2) A link in `/tmp/root/index.html` to `../up.html` or `/up.html` will be + resolved to the remote URL `https://example.com/up.html` because it traverses + outside of base-url." 
)] #[serde(default)] pub(crate) base_url: Option, @@ -692,22 +699,59 @@ at the given base URL address." /// Must be an absolute path. #[arg( long, - long_help = "Root directory to use when checking absolute links in local files. This option is -required if absolute links appear in local files, otherwise those links will be -flagged as errors. This must be an absolute path (i.e., one beginning with `/`). - -If specified, absolute links in local files are resolved by prefixing the given -root directory to the requested absolute link. For example, with a root-dir of -`/root/dir`, a link to `/page.html` would be resolved to `/root/dir/page.html`. - -This option can be specified alongside `--base-url`. If both are given, an -absolute link is resolved by constructing a URL from three parts: the domain -name specified in `--base-url`, followed by the `--root-dir` directory path, -followed by the absolute link's own path." + long_help = "Root directory to use when checking local files. This option is required if +absolute links appear in local files, otherwise those links will be flagged as +errors. This must be an absolute path (i.e., one beginning with `/`). + +If specified, `--root-dir` acts according to three main rules: + +- Links are resolved *as if* the given root-dir was hosted at the root of a + website. For example, with a root-dir of `/tmp`, a link in `/tmp/a/index.html` + to `/page.html` would be resolved to `/tmp/page.html`. + +- `--root-dir` only applies to links originating from files which are subpaths + of the given root directory. Other links will be unaffected (e.g., absolute + links from files outside of root-dir will still fail to be found). + +- `--root-dir` also serves to limit parent path traversal. With a root-dir of + `/tmp`, a link in `/tmp/index.html` to `../up.html` would be resolved to + `/tmp/up.html` and not `/up.html`. This is because if `/tmp` was uploaded to + a website root, traversing up beyond the root would not change the path. 
+ +Additionally, this option can be specified alongside `--base-url`. If both are +given, the behavior is augmented to resolve links as if `--root-dir` was +available at the remote URL of `--base-url`. See the help of `--base-url` for +more information." )] #[serde(default)] pub(crate) root_dir: Option, + /// Fallback base URL used for inputs where no more suitable base URL applies. + #[arg( + long, + value_parser = parse_base, + long_help = "Fallback base URL used for inputs where no more suitable base URL applies. +Each input source may have an associated base URL which describes where that +input was located, for the purpose of resolving relative links. Where Lychee +cannot determine a *well-founded* base URL for an input source, this fallback +base URL will be used. + +A *well-founded* base URL is one which: +- originates from a remote URL, in which case the base URL is just the remote URL, or +- originates from a local file where `--root-dir` has been specified and the local + file path is a subpath of `--root-dir`. + +In all other cases, the base URL is not well-founded and this fallback base URL +applies. In particular, this includes all links passed by stdin and, if `--root-dir` +is unspecified, this includes all links within local files. + +Note that this fallback base URL applies without consideration to local file paths. +For local files, it is usually better to specify `--base-url` and `--root-dir` +which will construct a base URL while considering subpaths of `--root-dir`." + )] + #[serde(default)] + pub(crate) fallback_base_url: Option, + /// Basic authentication support. E.g. `http://example.com username:password` #[arg(long)] #[serde(default)] From 8051840253b889ca3546bd8bdc09d520ee0a4d7d Mon Sep 17 00:00:00 2001 From: rina Date: Sun, 7 Sep 2025 21:07:28 +1000 Subject: [PATCH 29/59] wire up fallback base url.
TODO: TESTS for fallback base url --- lychee-bin/src/main.rs | 24 ++++++++++++++---------- lychee-bin/src/options.rs | 1 + lychee-lib/src/collector.rs | 15 ++++++++++++--- lychee-lib/src/types/base_info.rs | 14 ++++++++++++-- lychee-lib/src/utils/request.rs | 5 +++-- 5 files changed, 42 insertions(+), 17 deletions(-) diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index 2dee326e9b..8416ebcf80 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -330,16 +330,20 @@ async fn run(opts: &LycheeOptions) -> Result { return Ok(exit_code as i32); } - let mut collector = Collector::new(opts.config.root_dir.clone(), base)? - .skip_missing_inputs(opts.config.skip_missing) - .skip_hidden(!opts.config.hidden) - .skip_ignored(!opts.config.no_ignore) - .include_verbatim(opts.config.include_verbatim) - .headers(HeaderMap::from_header_pairs(&opts.config.header)?) - .excluded_paths(PathExcludes::new(opts.config.exclude_path.clone())?) - // File a bug if you rely on this envvar! It's going to go away eventually. - .use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").is_ok_and(|x| x == "1")) - .include_wikilinks(opts.config.include_wikilinks); + let mut collector = Collector::new( + opts.config.root_dir.clone(), + base, + opts.config.fallback_base_url.clone(), + )? + .skip_missing_inputs(opts.config.skip_missing) + .skip_hidden(!opts.config.hidden) + .skip_ignored(!opts.config.no_ignore) + .include_verbatim(opts.config.include_verbatim) + .headers(HeaderMap::from_header_pairs(&opts.config.header)?) + .excluded_paths(PathExcludes::new(opts.config.exclude_path.clone())?) + // File a bug if you rely on this envvar! It's going to go away eventually. + .use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").is_ok_and(|x| x == "1")) + .include_wikilinks(opts.config.include_wikilinks); collector = if let Some(ref basic_auth) = opts.config.basic_auth { collector.basic_auth_extractor(BasicAuthExtractor::new(basic_auth)?) 
diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index cdb3191792..bef5c56760 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -869,6 +869,7 @@ impl Config { exclude: Vec::::new(); extensions: FileType::default_extensions(); fallback_extensions: Vec::::new(); + fallback_base_url: None; format: StatsFormat::default(); glob_ignore_case: false; header: Vec::<(String, String)>::new(); diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 2b3c35ae0b..1d2b178c65 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -33,6 +33,7 @@ pub struct Collector { use_html5ever: bool, root_dir: Option, base: Option, + fallback_base: Option, excluded_paths: PathExcludes, headers: HeaderMap, client: Client, @@ -56,6 +57,7 @@ impl Default for Collector { skip_ignored: true, root_dir: None, base: None, + fallback_base: None, headers: HeaderMap::new(), client: Client::new(), excluded_paths: PathExcludes::empty(), @@ -70,7 +72,11 @@ impl Collector { /// /// Returns an `Err` if the `root_dir` is not an absolute path /// or if the reqwest `Client` fails to build - pub fn new(root_dir: Option, base: Option) -> Result { + pub fn new( + root_dir: Option, + base: Option, + fallback_base: Option, + ) -> Result { if let Some(root_dir) = &root_dir { if root_dir.is_relative() { return Err(ErrorKind::RootDirMustBeAbsolute(root_dir.clone())); @@ -91,6 +97,7 @@ impl Collector { excluded_paths: PathExcludes::empty(), root_dir, base, + fallback_base, }) } @@ -285,6 +292,7 @@ impl Collector { .flatten() .par_then_unordered(None, move |(content, base)| { let root_dir = self.root_dir.clone(); + let fallback_base = self.fallback_base.clone(); let basic_auth_extractor = self.basic_auth_extractor.clone(); async move { let content = content?; @@ -294,6 +302,7 @@ impl Collector { &content.source, root_dir.as_deref(), base.as_ref(), + fallback_base.as_ref(), basic_auth_extractor.as_ref(), ); 
Result::Ok(stream::iter(requests.into_iter().map(Ok))) @@ -325,7 +334,7 @@ mod tests { root_dir: Option, base: Option, ) -> Result> { - let responses = Collector::new(root_dir, base)?.collect_links(inputs); + let responses = Collector::new(root_dir, base, None)?.collect_links(inputs); Ok(responses.map(|r| r.unwrap().uri).collect().await) } @@ -339,7 +348,7 @@ mod tests { base: Option, extensions: FileExtensions, ) -> Result> { - let responses = Collector::new(root_dir, base)? + let responses = Collector::new(root_dir, base, None)? .include_verbatim(true) .collect_links_from_file_types(inputs, extensions); Ok(responses.map(|r| r.unwrap().uri).collect().await) diff --git a/lychee-lib/src/types/base_info.rs b/lychee-lib/src/types/base_info.rs index 99d51359ad..670a2bcaa3 100644 --- a/lychee-lib/src/types/base_info.rs +++ b/lychee-lib/src/types/base_info.rs @@ -55,6 +55,7 @@ impl SourceBaseInfo { source: &InputSource, root_dir: Option<&Path>, base: Option<&Base>, + fallback_base: Option<&Base>, ) -> Result { let root_dir_url = root_dir .map(|path| Base::Local(path.to_owned()).to_url()) @@ -66,6 +67,9 @@ impl SourceBaseInfo { .transpose()? 
.or_else(|| root_dir_url.clone()); + let fallback_base_url = fallback_base.map(Base::to_url).transpose()?; + let fallback_base_result = fallback_base_url.map(|url| (url, String::new(), true)); + let source_url = source.to_url()?; // BACKWARDS COMPAT: if /only/ base-url is given, then apply it @@ -84,7 +88,7 @@ impl SourceBaseInfo { }; let Some(source_url) = source_url else { - return Self::new(None, remote_local_mappings); + return Self::new(fallback_base_result, remote_local_mappings); }; let base = remote_local_mappings @@ -94,7 +98,13 @@ impl SourceBaseInfo { .strip_prefix(local) .map(|subpath| (remote.clone(), subpath, true)) }) - .map_or_else(|| SourceBaseInfo::infer_default_base(&source_url), Ok)?; + .map_or_else( + || match fallback_base_result { + Some(fallback) => Ok(fallback), + None => Self::infer_default_base(&source_url), + }, + Ok, + )?; Self::new(Some(base), remote_local_mappings) } diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 4d5a2da6a3..c369795501 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -48,7 +48,7 @@ fn try_parse_into_uri( root_dir: Option<&Path>, base: Option<&Base>, ) -> Result { - let base_info = SourceBaseInfo::from_source(source, root_dir, base)?; + let base_info = SourceBaseInfo::from_source(source, root_dir, base, None)?; base_info.parse_uri(raw_uri) } @@ -116,9 +116,10 @@ pub(crate) fn create( source: &InputSource, root_dir: Option<&Path>, base: Option<&Base>, + fallback_base: Option<&Base>, extractor: Option<&BasicAuthExtractor>, ) -> HashSet { - let base_info = match SourceBaseInfo::from_source(source, root_dir, base) { + let base_info = match SourceBaseInfo::from_source(source, root_dir, base, fallback_base) { Ok(base_info) => base_info, Err(e) => { let source = truncate_source(source); From 2b6650c71f714dbc7e42b6ca88b56a38bd1726d3 Mon Sep 17 00:00:00 2001 From: rina Date: Sun, 7 Sep 2025 21:12:44 +1000 Subject: [PATCH 30/59] fix tests after 
fallback_base --- lychee-lib/src/collector.rs | 2 +- lychee-lib/src/types/base_info.rs | 6 +++-- lychee-lib/src/utils/request.rs | 38 ++++++++++++++++--------------- 3 files changed, 25 insertions(+), 21 deletions(-) diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 1d2b178c65..bcbdeee214 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -428,7 +428,7 @@ mod tests { }), ]); - let collector = Collector::new(Some(temp_dir_path.to_path_buf()), None)?; + let collector = Collector::new(Some(temp_dir_path.to_path_buf()), None, None)?; let sources: Vec<_> = collector.collect_sources(inputs).collect().await; diff --git a/lychee-lib/src/types/base_info.rs b/lychee-lib/src/types/base_info.rs index 670a2bcaa3..4635ae5b00 100644 --- a/lychee-lib/src/types/base_info.rs +++ b/lychee-lib/src/types/base_info.rs @@ -156,7 +156,8 @@ mod tests { let root_dir = PathBuf::from("/some"); let base = Base::try_from("https://example.com/path/page2.html").unwrap(); let source = InputSource::FsPath(PathBuf::from("/some/page.html")); - let base_info = SourceBaseInfo::from_source(&source, Some(&root_dir), Some(&base)).unwrap(); + let base_info = + SourceBaseInfo::from_source(&source, Some(&root_dir), Some(&base), None).unwrap(); assert_eq!( base_info @@ -172,7 +173,8 @@ mod tests { let root_dir = PathBuf::from("/some/pagex.html"); let base = Base::try_from("https://example.com/path/page.html").unwrap(); let source = InputSource::FsPath(PathBuf::from("/some/pagex.html")); - let base_info = SourceBaseInfo::from_source(&source, Some(&root_dir), Some(&base)).unwrap(); + let base_info = + SourceBaseInfo::from_source(&source, Some(&root_dir), Some(&base), None).unwrap(); assert_eq!( base_info diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index c369795501..b5d1204bbd 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -213,7 +213,7 @@ mod tests { let source = 
InputSource::String(String::new()); let uris = vec![RawUri::from("relative.html")]; - let requests = create(uris, &source, None, Some(&base), None); + let requests = create(uris, &source, None, Some(&base), None, None); assert_eq!(requests.len(), 1); assert!( @@ -229,7 +229,7 @@ mod tests { let source = InputSource::String(String::new()); let uris = vec![RawUri::from("https://another.com/page")]; - let requests = create(uris, &source, None, Some(&base), None); + let requests = create(uris, &source, None, Some(&base), None, None); assert_eq!(requests.len(), 1); assert!( @@ -245,7 +245,7 @@ mod tests { let source = InputSource::String(String::new()); let uris = vec![RawUri::from("/root-relative")]; - let requests = create(uris, &source, None, Some(&base), None); + let requests = create(uris, &source, None, Some(&base), None, None); assert_eq!(requests.len(), 1); assert!( @@ -261,7 +261,7 @@ mod tests { let source = InputSource::String(String::new()); let uris = vec![RawUri::from("../parent")]; - let requests = create(uris, &source, None, Some(&base), None); + let requests = create(uris, &source, None, Some(&base), None, None); assert_eq!(requests.len(), 1); assert!( @@ -277,7 +277,7 @@ mod tests { let source = InputSource::String(String::new()); let uris = vec![RawUri::from("#fragment")]; - let requests = create(uris, &source, None, Some(&base), None); + let requests = create(uris, &source, None, Some(&base), None, None); assert_eq!(requests.len(), 1); assert!( @@ -293,7 +293,7 @@ mod tests { let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("relative.html")]; - let requests = create(uris, &source, Some(&root_dir), None, None); + let requests = create(uris, &source, Some(&root_dir), None, None, None); assert_eq!(requests.len(), 1); assert!( @@ -309,7 +309,7 @@ mod tests { let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("https://another.com/page")]; - let requests = 
create(uris, &source, Some(&root_dir), None, None); + let requests = create(uris, &source, Some(&root_dir), None, None, None); assert_eq!(requests.len(), 1); assert!( @@ -325,7 +325,7 @@ mod tests { let source = InputSource::FsPath(PathBuf::from("/tmp/lychee/page.html")); let uris = vec![RawUri::from("/root-relative")]; - let requests = create(uris, &source, Some(&root_dir), None, None); + let requests = create(uris, &source, Some(&root_dir), None, None, None); assert_eq!(requests.len(), 1); assert!( @@ -341,7 +341,7 @@ mod tests { let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("../parent")]; - let requests = create(uris, &source, Some(&root_dir), None, None); + let requests = create(uris, &source, Some(&root_dir), None, None, None); assert_eq!(requests.len(), 1); assert!( @@ -357,7 +357,7 @@ mod tests { let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("#fragment")]; - let requests = create(uris, &source, Some(&root_dir), None, None); + let requests = create(uris, &source, Some(&root_dir), None, None, None); assert_eq!(requests.len(), 1); assert!( @@ -374,7 +374,7 @@ mod tests { let source = InputSource::FsPath(PathBuf::from("/tmp/lychee/localpage.html")); let uris = vec![RawUri::from("relative.html")]; - let requests = create(uris, &source, Some(&root_dir), Some(&base), None); + let requests = create(uris, &source, Some(&root_dir), Some(&base), None, None); assert_eq!(requests.len(), 1); assert!( @@ -391,7 +391,7 @@ mod tests { let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("https://another.com/page")]; - let requests = create(uris, &source, Some(&root_dir), Some(&base), None); + let requests = create(uris, &source, Some(&root_dir), Some(&base), None, None); println!("{:?}", requests); assert_eq!(requests.len(), 1); @@ -409,7 +409,7 @@ mod tests { let source = 
InputSource::FsPath(PathBuf::from("/tmp/lychee/localpage.html")); let uris = vec![RawUri::from("/root-relative")]; - let requests = create(uris, &source, Some(&root_dir), Some(&base), None); + let requests = create(uris, &source, Some(&root_dir), Some(&base), None, None); println!("{:?}", requests); assert_eq!(requests.len(), 1); @@ -427,7 +427,7 @@ mod tests { let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("../parent")]; - let requests = create(uris, &source, Some(&root_dir), Some(&base), None); + let requests = create(uris, &source, Some(&root_dir), Some(&base), None, None); assert_eq!(requests.len(), 1); assert!( @@ -444,7 +444,7 @@ mod tests { let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("#fragment")]; - let requests = create(uris, &source, Some(&root_dir), Some(&base), None); + let requests = create(uris, &source, Some(&root_dir), Some(&base), None, None); assert_eq!(requests.len(), 1); assert!( @@ -459,7 +459,7 @@ mod tests { let source = InputSource::String(String::new()); let uris = vec![RawUri::from("https://example.com/page")]; - let requests = create(uris, &source, None, None, None); + let requests = create(uris, &source, None, None, None, None); assert_eq!(requests.len(), 1); assert!( @@ -473,7 +473,8 @@ mod tests { fn test_create_request_from_relative_file_path() { let base = Base::Local(PathBuf::from("/tmp/lychee")); let input_source = InputSource::FsPath(PathBuf::from("page.html")); - let base_info = SourceBaseInfo::from_source(&input_source, None, Some(&base)).unwrap(); + let base_info = + SourceBaseInfo::from_source(&input_source, None, Some(&base), None).unwrap(); let actual = create_request(&RawUri::from("file.html"), &input_source, &base_info, None).unwrap(); @@ -496,7 +497,8 @@ mod tests { fn test_create_request_from_absolute_file_path() { let base = Base::Local(PathBuf::from("/tmp/lychee")); let input_source = 
InputSource::FsPath(PathBuf::from("/tmp/lychee/page.html")); - let base_info = SourceBaseInfo::from_source(&input_source, None, Some(&base)).unwrap(); + let base_info = + SourceBaseInfo::from_source(&input_source, None, Some(&base), None).unwrap(); // Use an absolute path that's outside the base directory let actual = create_request( From d9b1199b9aa72a4d638669deb41a9f9d6c48f55f Mon Sep 17 00:00:00 2001 From: rina Date: Sun, 7 Sep 2025 21:14:39 +1000 Subject: [PATCH 31/59] remove backwards compat base-url handling --- lychee-lib/src/types/base_info.rs | 16 +++------------- lychee-lib/src/utils/url.rs | 2 +- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/lychee-lib/src/types/base_info.rs b/lychee-lib/src/types/base_info.rs index 4635ae5b00..612d6b7e02 100644 --- a/lychee-lib/src/types/base_info.rs +++ b/lychee-lib/src/types/base_info.rs @@ -67,26 +67,16 @@ impl SourceBaseInfo { .transpose()? .or_else(|| root_dir_url.clone()); - let fallback_base_url = fallback_base.map(Base::to_url).transpose()?; - let fallback_base_result = fallback_base_url.map(|url| (url, String::new(), true)); - let source_url = source.to_url()?; - // BACKWARDS COMPAT: if /only/ base-url is given, then apply it - // indiscriminately to all inputs, regardless of source, and apply no mappings. 
- match (&base_url, &root_dir_url) { - (Some(base_url), None) => { - let (origin, subpath, _) = Self::infer_default_base(base_url)?; - return Self::new(Some((origin, subpath, true)), vec![]); - } - _ => (), - } - let remote_local_mappings = match (base_url, root_dir_url) { (Some(base_url), Some(root_dir_url)) => vec![(base_url, root_dir_url)], _ => vec![], }; + let fallback_base_url = fallback_base.map(Base::to_url).transpose()?; + let fallback_base_result = fallback_base_url.map(|url| (url, String::new(), true)); + let Some(source_url) = source_url else { return Self::new(fallback_base_result, remote_local_mappings); }; diff --git a/lychee-lib/src/utils/url.rs b/lychee-lib/src/utils/url.rs index 48e5f7b452..08df4c80e9 100644 --- a/lychee-lib/src/utils/url.rs +++ b/lychee-lib/src/utils/url.rs @@ -35,7 +35,7 @@ impl ReqwestUrlExt for Url { fn strip_prefix(&self, prefix: &Url) -> Option { prefix .make_relative(self) - .filter(|subpath| !subpath.starts_with("../") && !subpath.starts_with("/")) + .filter(|subpath| !subpath.starts_with("../") && !subpath.starts_with('/')) // .inspect(|x| println!("subpathing {}", x)) // .filter(|_| prefix.as_str().starts_with(self.as_str())) } From add09c7e752a4675dda94077e9831b6cb52b570f Mon Sep 17 00:00:00 2001 From: rina Date: Sun, 7 Sep 2025 21:19:17 +1000 Subject: [PATCH 32/59] fix collect tests --- lychee-lib/src/collector.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index bcbdeee214..1ff8c9522e 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -332,9 +332,11 @@ mod tests { async fn collect( inputs: HashSet, root_dir: Option, - base: Option, + fallback_base: Option, ) -> Result> { - let responses = Collector::new(root_dir, base, None)?.collect_links(inputs); + // NOTE: base is passed as fallback_base because these tests are written + // to test the old behaviour. 
+ let responses = Collector::new(root_dir, None, fallback_base)?.collect_links(inputs); Ok(responses.map(|r| r.unwrap().uri).collect().await) } From ff72bd633bc684ca4c35d85545c5c7965c79a2c1 Mon Sep 17 00:00:00 2001 From: rina Date: Sun, 7 Sep 2025 21:39:33 +1000 Subject: [PATCH 33/59] fix fallback_base tests --- lychee-lib/src/collector.rs | 9 ++++----- lychee-lib/src/utils/request.rs | 22 +++++++++++++--------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 1ff8c9522e..8606de9252 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -715,8 +715,7 @@ mod tests { #[tokio::test] async fn test_file_path_with_base() { - let base = Base::try_from("/path/to/root").unwrap(); - assert_eq!(base, Base::Local("/path/to/root".into())); + let base = Base::try_from("https://example.com/a/").unwrap(); let input = Input { source: InputSource::String( @@ -735,9 +734,9 @@ mod tests { let links = collect(inputs, None, Some(base)).await.ok().unwrap(); let expected_links = HashSet::from_iter([ - path("/path/to/root/index.html"), - path("/path/to/root/about.html"), - path("/another.html"), + website("https://example.com/a/index.html"), + website("https://example.com/a/about.html"), + website("https://example.com/another.html"), ]); assert_eq!(links, expected_links); diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index b5d1204bbd..06e812943a 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -48,7 +48,11 @@ fn try_parse_into_uri( root_dir: Option<&Path>, base: Option<&Base>, ) -> Result { - let base_info = SourceBaseInfo::from_source(source, root_dir, base, None)?; + // HACK: if only base_url is specified, use that as a fallback_base_url. 
+ let base_info = match (root_dir, base) { + (None, Some(base)) => SourceBaseInfo::from_source(source, root_dir, None, Some(base)), + (root_dir, base) => SourceBaseInfo::from_source(source, root_dir, base, None), + }?; base_info.parse_uri(raw_uri) } @@ -213,7 +217,7 @@ mod tests { let source = InputSource::String(String::new()); let uris = vec![RawUri::from("relative.html")]; - let requests = create(uris, &source, None, Some(&base), None, None); + let requests = create(uris, &source, None, None, Some(&base), None); assert_eq!(requests.len(), 1); assert!( @@ -229,7 +233,7 @@ mod tests { let source = InputSource::String(String::new()); let uris = vec![RawUri::from("https://another.com/page")]; - let requests = create(uris, &source, None, Some(&base), None, None); + let requests = create(uris, &source, None, None, Some(&base), None); assert_eq!(requests.len(), 1); assert!( @@ -245,7 +249,7 @@ mod tests { let source = InputSource::String(String::new()); let uris = vec![RawUri::from("/root-relative")]; - let requests = create(uris, &source, None, Some(&base), None, None); + let requests = create(uris, &source, None, None, Some(&base), None); assert_eq!(requests.len(), 1); assert!( @@ -261,7 +265,7 @@ mod tests { let source = InputSource::String(String::new()); let uris = vec![RawUri::from("../parent")]; - let requests = create(uris, &source, None, Some(&base), None, None); + let requests = create(uris, &source, None, None, Some(&base), None); assert_eq!(requests.len(), 1); assert!( @@ -277,7 +281,7 @@ mod tests { let source = InputSource::String(String::new()); let uris = vec![RawUri::from("#fragment")]; - let requests = create(uris, &source, None, Some(&base), None, None); + let requests = create(uris, &source, None, None, Some(&base), None); assert_eq!(requests.len(), 1); assert!( @@ -474,7 +478,7 @@ mod tests { let base = Base::Local(PathBuf::from("/tmp/lychee")); let input_source = InputSource::FsPath(PathBuf::from("page.html")); let base_info = - 
SourceBaseInfo::from_source(&input_source, None, Some(&base), None).unwrap(); + SourceBaseInfo::from_source(&input_source, None, None, Some(&base)).unwrap(); let actual = create_request(&RawUri::from("file.html"), &input_source, &base_info, None).unwrap(); @@ -495,10 +499,10 @@ mod tests { #[test] fn test_create_request_from_absolute_file_path() { - let base = Base::Local(PathBuf::from("/tmp/lychee")); + let base = Base::Local(PathBuf::from("/")); let input_source = InputSource::FsPath(PathBuf::from("/tmp/lychee/page.html")); let base_info = - SourceBaseInfo::from_source(&input_source, None, Some(&base), None).unwrap(); + SourceBaseInfo::from_source(&input_source, None, None, Some(&base)).unwrap(); // Use an absolute path that's outside the base directory let actual = create_request( From 126a9b2b1d966c5b7e91e666630e35f4cb3d4df7 Mon Sep 17 00:00:00 2001 From: rina Date: Mon, 8 Sep 2025 00:12:43 +1000 Subject: [PATCH 34/59] fix fallback bug where fallback was applied on top of well-founded inferred https bases --- lychee-lib/src/types/base_info.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/lychee-lib/src/types/base_info.rs b/lychee-lib/src/types/base_info.rs index 612d6b7e02..6d0236cb40 100644 --- a/lychee-lib/src/types/base_info.rs +++ b/lychee-lib/src/types/base_info.rs @@ -21,6 +21,8 @@ impl SourceBaseInfo { base: Option<(Url, String, bool)>, remote_local_mappings: Vec<(Url, Url)>, ) -> Result { + // TODO: check no repeated bases/roots on the same side. 
+ // TODO: choose longest match if multiple could apply let conflicting_mapping = remote_local_mappings.iter().find(|(remote, local)| { if remote == local { false @@ -75,10 +77,10 @@ impl SourceBaseInfo { }; let fallback_base_url = fallback_base.map(Base::to_url).transpose()?; - let fallback_base_result = fallback_base_url.map(|url| (url, String::new(), true)); + let fallback_base_option = fallback_base_url.map(|url| (url, String::new(), true)); let Some(source_url) = source_url else { - return Self::new(fallback_base_result, remote_local_mappings); + return Self::new(fallback_base_option, remote_local_mappings); }; let base = remote_local_mappings @@ -89,9 +91,10 @@ impl SourceBaseInfo { .map(|subpath| (remote.clone(), subpath, true)) }) .map_or_else( - || match fallback_base_result { - Some(fallback) => Ok(fallback), - None => Self::infer_default_base(&source_url), + || match Self::infer_default_base(&source_url) { + ok @ Ok((_, _, _allow_absolute @ false)) => fallback_base_option.map_or(ok, Ok), + Ok(x) => Ok(x), + Err(e) => fallback_base_option.ok_or(e), }, Ok, )?; From 988813461b729c8ee4731177e9c2e6fd2d1b1091 Mon Sep 17 00:00:00 2001 From: rina Date: Mon, 8 Sep 2025 00:46:12 +1000 Subject: [PATCH 35/59] propagate root_and_base --- lychee-lib/src/collector.rs | 54 +++++++++++++++---------------- lychee-lib/src/types/base_info.rs | 41 ++++++++++++----------- lychee-lib/src/utils/request.rs | 45 +++++++++++++------------- 3 files changed, 71 insertions(+), 69 deletions(-) diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 8606de9252..b1bf71b67a 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -31,8 +31,7 @@ pub struct Collector { include_verbatim: bool, include_wikilinks: bool, use_html5ever: bool, - root_dir: Option, - base: Option, + root_and_base: Option<(PathBuf, Option)>, fallback_base: Option, excluded_paths: PathExcludes, headers: HeaderMap, @@ -55,8 +54,7 @@ impl Default for Collector { use_html5ever: 
false, skip_hidden: true, skip_ignored: true, - root_dir: None, - base: None, + root_and_base: None, fallback_base: None, headers: HeaderMap::new(), client: Client::new(), @@ -82,6 +80,17 @@ impl Collector { return Err(ErrorKind::RootDirMustBeAbsolute(root_dir.clone())); } } + let root_and_base = match (root_dir, base) { + (None, Some(base)) => { + return Err(ErrorKind::InvalidBase( + format!("{base:?}"), + "base cannot be specified without root dir".to_string(), + )); + } + (None, None) => None, + (Some(root_dir), base) => Some((root_dir, base)), + }; + Ok(Collector { basic_auth_extractor: None, skip_missing_inputs: false, @@ -95,8 +104,7 @@ impl Collector { .build() .map_err(ErrorKind::BuildRequestClient)?, excluded_paths: PathExcludes::empty(), - root_dir, - base, + root_and_base, fallback_base, }) } @@ -249,7 +257,6 @@ impl Collector { let skip_missing_inputs = self.skip_missing_inputs; let skip_hidden = self.skip_hidden; let skip_ignored = self.skip_ignored; - let global_base = self.base; let excluded_paths = self.excluded_paths; let resolver = UrlContentResolver { @@ -266,32 +273,24 @@ impl Collector { stream::iter(inputs) .par_then_unordered(None, move |input| { - let default_base = global_base.clone(); let extensions = extensions.clone(); let resolver = resolver.clone(); let excluded_paths = excluded_paths.clone(); async move { - let base = match &input.source { - InputSource::RemoteUrl(url) => Base::try_from(url.as_str()).ok(), - _ => default_base, - }; - - input - .get_contents( - skip_missing_inputs, - skip_hidden, - skip_ignored, - extensions, - resolver, - excluded_paths, - ) - .map(move |content| (content, base.clone())) + input.get_contents( + skip_missing_inputs, + skip_hidden, + skip_ignored, + extensions, + resolver, + excluded_paths, + ) } }) .flatten() - .par_then_unordered(None, move |(content, base)| { - let root_dir = self.root_dir.clone(); + .par_then_unordered(None, move |content| { + let root_and_base = self.root_and_base.clone(); let 
fallback_base = self.fallback_base.clone(); let basic_auth_extractor = self.basic_auth_extractor.clone(); async move { @@ -300,8 +299,9 @@ impl Collector { let requests = request::create( uris, &content.source, - root_dir.as_deref(), - base.as_ref(), + root_and_base + .as_ref() + .map(|(x, y)| (x.as_ref(), y.as_ref())), fallback_base.as_ref(), basic_auth_extractor.as_ref(), ); diff --git a/lychee-lib/src/types/base_info.rs b/lychee-lib/src/types/base_info.rs index 6d0236cb40..6be0c80f36 100644 --- a/lychee-lib/src/types/base_info.rs +++ b/lychee-lib/src/types/base_info.rs @@ -55,32 +55,33 @@ impl SourceBaseInfo { pub fn from_source( source: &InputSource, - root_dir: Option<&Path>, - base: Option<&Base>, + root_and_base: Option<(&Path, Option<&Base>)>, fallback_base: Option<&Base>, ) -> Result { - let root_dir_url = root_dir - .map(|path| Base::Local(path.to_owned()).to_url()) - .transpose()?; - - // println!("{:?}", base.clone()); - let base_url: Option = base - .map(Base::to_url) - .transpose()? 
- .or_else(|| root_dir_url.clone()); + let root_and_base: Option<(Url, Url)> = match root_and_base { + Some((root, Some(base))) => Some((root, base.clone())), + Some((root, None)) => Some((root, Base::Local(root.to_owned()))), + None => None, + } + .map(|(root, base)| -> Result<_, ErrorKind> { + let root_url = Base::Local(root.to_owned()).to_url()?; + Ok((root_url, base.to_url()?)) + }) + .transpose()?; let source_url = source.to_url()?; - let remote_local_mappings = match (base_url, root_dir_url) { - (Some(base_url), Some(root_dir_url)) => vec![(base_url, root_dir_url)], + let remote_local_mappings = match root_and_base { + Some((root_dir_url, base_url)) => vec![(base_url, root_dir_url)], _ => vec![], }; let fallback_base_url = fallback_base.map(Base::to_url).transpose()?; - let fallback_base_option = fallback_base_url.map(|url| (url, String::new(), true)); + let fallback_base_option = + move || fallback_base_url.map(|url| (url.clone(), String::new(), true)); let Some(source_url) = source_url else { - return Self::new(fallback_base_option, remote_local_mappings); + return Self::new(fallback_base_option(), remote_local_mappings); }; let base = remote_local_mappings @@ -92,9 +93,11 @@ impl SourceBaseInfo { }) .map_or_else( || match Self::infer_default_base(&source_url) { - ok @ Ok((_, _, _allow_absolute @ false)) => fallback_base_option.map_or(ok, Ok), + ok @ Ok((_, _, _allow_absolute @ false)) => { + fallback_base_option().map_or(ok, Ok) + } Ok(x) => Ok(x), - Err(e) => fallback_base_option.ok_or(e), + Err(e) => fallback_base_option().ok_or(e), }, Ok, )?; @@ -150,7 +153,7 @@ mod tests { let base = Base::try_from("https://example.com/path/page2.html").unwrap(); let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let base_info = - SourceBaseInfo::from_source(&source, Some(&root_dir), Some(&base), None).unwrap(); + SourceBaseInfo::from_source(&source, Some((&root_dir, Some(&base))), None).unwrap(); assert_eq!( base_info @@ -167,7 +170,7 @@ mod tests { 
let base = Base::try_from("https://example.com/path/page.html").unwrap(); let source = InputSource::FsPath(PathBuf::from("/some/pagex.html")); let base_info = - SourceBaseInfo::from_source(&source, Some(&root_dir), Some(&base), None).unwrap(); + SourceBaseInfo::from_source(&source, Some((&root_dir, Some(&base))), None).unwrap(); assert_eq!( base_info diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 06e812943a..2d7ff9411d 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -50,8 +50,8 @@ fn try_parse_into_uri( ) -> Result { // HACK: if only base_url is specified, use that as a fallback_base_url. let base_info = match (root_dir, base) { - (None, Some(base)) => SourceBaseInfo::from_source(source, root_dir, None, Some(base)), - (root_dir, base) => SourceBaseInfo::from_source(source, root_dir, base, None), + (None, base) => SourceBaseInfo::from_source(source, None, base), + (Some(root_dir), base) => SourceBaseInfo::from_source(source, Some((root_dir, base)), None), }?; base_info.parse_uri(raw_uri) } @@ -118,12 +118,11 @@ fn truncate_source(source: &InputSource) -> InputSource { pub(crate) fn create( uris: Vec, source: &InputSource, - root_dir: Option<&Path>, - base: Option<&Base>, + root_and_base: Option<(&Path, Option<&Base>)>, fallback_base: Option<&Base>, extractor: Option<&BasicAuthExtractor>, ) -> HashSet { - let base_info = match SourceBaseInfo::from_source(source, root_dir, base, fallback_base) { + let base_info = match SourceBaseInfo::from_source(source, root_and_base, fallback_base) { Ok(base_info) => base_info, Err(e) => { let source = truncate_source(source); @@ -217,7 +216,7 @@ mod tests { let source = InputSource::String(String::new()); let uris = vec![RawUri::from("relative.html")]; - let requests = create(uris, &source, None, None, Some(&base), None); + let requests = create(uris, &source, None, Some(&base), None); assert_eq!(requests.len(), 1); assert!( @@ -233,7 +232,7 @@ mod tests { 
let source = InputSource::String(String::new()); let uris = vec![RawUri::from("https://another.com/page")]; - let requests = create(uris, &source, None, None, Some(&base), None); + let requests = create(uris, &source, None, Some(&base), None); assert_eq!(requests.len(), 1); assert!( @@ -249,7 +248,7 @@ mod tests { let source = InputSource::String(String::new()); let uris = vec![RawUri::from("/root-relative")]; - let requests = create(uris, &source, None, None, Some(&base), None); + let requests = create(uris, &source, None, Some(&base), None); assert_eq!(requests.len(), 1); assert!( @@ -265,7 +264,7 @@ mod tests { let source = InputSource::String(String::new()); let uris = vec![RawUri::from("../parent")]; - let requests = create(uris, &source, None, None, Some(&base), None); + let requests = create(uris, &source, None, Some(&base), None); assert_eq!(requests.len(), 1); assert!( @@ -281,7 +280,7 @@ mod tests { let source = InputSource::String(String::new()); let uris = vec![RawUri::from("#fragment")]; - let requests = create(uris, &source, None, None, Some(&base), None); + let requests = create(uris, &source, None, Some(&base), None); assert_eq!(requests.len(), 1); assert!( @@ -297,7 +296,7 @@ mod tests { let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("relative.html")]; - let requests = create(uris, &source, Some(&root_dir), None, None, None); + let requests = create(uris, &source, Some((&root_dir, None)), None, None); assert_eq!(requests.len(), 1); assert!( @@ -313,7 +312,7 @@ mod tests { let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("https://another.com/page")]; - let requests = create(uris, &source, Some(&root_dir), None, None, None); + let requests = create(uris, &source, Some((&root_dir, None)), None, None); assert_eq!(requests.len(), 1); assert!( @@ -329,7 +328,7 @@ mod tests { let source = InputSource::FsPath(PathBuf::from("/tmp/lychee/page.html")); let uris = 
vec![RawUri::from("/root-relative")]; - let requests = create(uris, &source, Some(&root_dir), None, None, None); + let requests = create(uris, &source, Some((&root_dir, None)), None, None); assert_eq!(requests.len(), 1); assert!( @@ -345,7 +344,7 @@ mod tests { let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("../parent")]; - let requests = create(uris, &source, Some(&root_dir), None, None, None); + let requests = create(uris, &source, Some((&root_dir, None)), None, None); assert_eq!(requests.len(), 1); assert!( @@ -361,7 +360,7 @@ mod tests { let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("#fragment")]; - let requests = create(uris, &source, Some(&root_dir), None, None, None); + let requests = create(uris, &source, Some((&root_dir, None)), None, None); assert_eq!(requests.len(), 1); assert!( @@ -378,7 +377,7 @@ mod tests { let source = InputSource::FsPath(PathBuf::from("/tmp/lychee/localpage.html")); let uris = vec![RawUri::from("relative.html")]; - let requests = create(uris, &source, Some(&root_dir), Some(&base), None, None); + let requests = create(uris, &source, Some((&root_dir, Some(&base))), None, None); assert_eq!(requests.len(), 1); assert!( @@ -395,7 +394,7 @@ mod tests { let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("https://another.com/page")]; - let requests = create(uris, &source, Some(&root_dir), Some(&base), None, None); + let requests = create(uris, &source, Some((&root_dir, Some(&base))), None, None); println!("{:?}", requests); assert_eq!(requests.len(), 1); @@ -413,7 +412,7 @@ mod tests { let source = InputSource::FsPath(PathBuf::from("/tmp/lychee/localpage.html")); let uris = vec![RawUri::from("/root-relative")]; - let requests = create(uris, &source, Some(&root_dir), Some(&base), None, None); + let requests = create(uris, &source, Some((&root_dir, Some(&base))), None, None); println!("{:?}", 
requests); assert_eq!(requests.len(), 1); @@ -431,7 +430,7 @@ mod tests { let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("../parent")]; - let requests = create(uris, &source, Some(&root_dir), Some(&base), None, None); + let requests = create(uris, &source, Some((&root_dir, Some(&base))), None, None); assert_eq!(requests.len(), 1); assert!( @@ -448,7 +447,7 @@ mod tests { let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("#fragment")]; - let requests = create(uris, &source, Some(&root_dir), Some(&base), None, None); + let requests = create(uris, &source, Some((&root_dir, Some(&base))), None, None); assert_eq!(requests.len(), 1); assert!( @@ -463,7 +462,7 @@ mod tests { let source = InputSource::String(String::new()); let uris = vec![RawUri::from("https://example.com/page")]; - let requests = create(uris, &source, None, None, None, None); + let requests = create(uris, &source, None, None, None); assert_eq!(requests.len(), 1); assert!( @@ -478,7 +477,7 @@ mod tests { let base = Base::Local(PathBuf::from("/tmp/lychee")); let input_source = InputSource::FsPath(PathBuf::from("page.html")); let base_info = - SourceBaseInfo::from_source(&input_source, None, None, Some(&base)).unwrap(); + SourceBaseInfo::from_source(&input_source, None, Some(&base)).unwrap(); let actual = create_request(&RawUri::from("file.html"), &input_source, &base_info, None).unwrap(); @@ -502,7 +501,7 @@ mod tests { let base = Base::Local(PathBuf::from("/")); let input_source = InputSource::FsPath(PathBuf::from("/tmp/lychee/page.html")); let base_info = - SourceBaseInfo::from_source(&input_source, None, None, Some(&base)).unwrap(); + SourceBaseInfo::from_source(&input_source, None, Some(&base)).unwrap(); // Use an absolute path that's outside the base directory let actual = create_request( From 9df0cc38bc4f141ab8f8ad1e4a87f495350df07e Mon Sep 17 00:00:00 2001 From: rina Date: Mon, 8 Sep 2025 00:47:54 +1000 
Subject: [PATCH 36/59] fmt --- lychee-lib/src/utils/request.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 2d7ff9411d..c13fb8aa0e 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -476,8 +476,7 @@ mod tests { fn test_create_request_from_relative_file_path() { let base = Base::Local(PathBuf::from("/tmp/lychee")); let input_source = InputSource::FsPath(PathBuf::from("page.html")); - let base_info = - SourceBaseInfo::from_source(&input_source, None, Some(&base)).unwrap(); + let base_info = SourceBaseInfo::from_source(&input_source, None, Some(&base)).unwrap(); let actual = create_request(&RawUri::from("file.html"), &input_source, &base_info, None).unwrap(); @@ -500,8 +499,7 @@ mod tests { fn test_create_request_from_absolute_file_path() { let base = Base::Local(PathBuf::from("/")); let input_source = InputSource::FsPath(PathBuf::from("/tmp/lychee/page.html")); - let base_info = - SourceBaseInfo::from_source(&input_source, None, Some(&base)).unwrap(); + let base_info = SourceBaseInfo::from_source(&input_source, None, Some(&base)).unwrap(); // Use an absolute path that's outside the base directory let actual = create_request( From 4fb0e08ce5e9d590ba7dba4f79b8c0d5de6f9654 Mon Sep 17 00:00:00 2001 From: rina Date: Mon, 8 Sep 2025 01:05:04 +1000 Subject: [PATCH 37/59] Display Base --- lychee-lib/src/collector.rs | 4 ++-- lychee-lib/src/types/base.rs | 10 ++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index b1bf71b67a..f7f91ebd2a 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -83,8 +83,8 @@ impl Collector { let root_and_base = match (root_dir, base) { (None, Some(base)) => { return Err(ErrorKind::InvalidBase( - format!("{base:?}"), - "base cannot be specified without root dir".to_string(), + base.to_string(), + "base must be 
specified alongside root dir, but root dir is unset".to_string(), )); } (None, None) => None, diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs index 1e9e61bf86..56cc5f957f 100644 --- a/lychee-lib/src/types/base.rs +++ b/lychee-lib/src/types/base.rs @@ -1,5 +1,6 @@ use reqwest::Url; use serde::{Deserialize, Serialize}; +use std::fmt; use std::{convert::TryFrom, path::PathBuf}; use crate::{ErrorKind, InputSource}; @@ -84,6 +85,15 @@ impl TryFrom for Base { } } +impl fmt::Display for Base { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Local(path) => write!(f, "{}", path.display()), + Self::Remote(url) => write!(f, "{}", url), + } + } +} + #[cfg(test)] mod test_base { use crate::Result; From 910fdc84988276f86811748b6640885a6d5b8d61 Mon Sep 17 00:00:00 2001 From: rina Date: Mon, 8 Sep 2025 19:29:20 +1000 Subject: [PATCH 38/59] doc comments ig --- lychee-lib/src/types/base_info.rs | 56 ++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/lychee-lib/src/types/base_info.rs b/lychee-lib/src/types/base_info.rs index 6be0c80f36..64dd974fcf 100644 --- a/lychee-lib/src/types/base_info.rs +++ b/lychee-lib/src/types/base_info.rs @@ -9,10 +9,23 @@ use crate::types::uri::raw::RawUri; use crate::utils::url::ReqwestUrlExt; use url::PathSegmentsMut; +/// Information needed for resolving relative URLs within a particular +/// [`InputSource`]. The main entry point for constructing a `SourceBaseInfo` +/// is [`SourceBaseInfo::from_source`]. Once constructed, +/// [`SourceBaseInfo::parse_uri`] can be used to parse a URI found within +/// the `InputSource`. +/// +/// A `SourceBaseInfo` may or may not have an associated base which is used +/// for resolving relative URLs. If no base is available, parsing relative +/// and root-relative links will fail. If a base is available but it is not +/// *well-founded*, then parsing root-relative links will fail. 
See +/// [`SourceBaseInfo::from_source`] for a description of well-founded. #[derive(Debug, PartialEq, Eq, Clone)] pub struct SourceBaseInfo { - /// Tuple of `origin`, `subpath`, `allow_absolute` + /// Tuple of `origin`, `subpath`, `allow_absolute`. The field `allow_absolute` + /// is true if the base is well-founded. base: Option<(Url, String, bool)>, + /// List of tuples of `remote_url`, `local_url`. remote_local_mappings: Vec<(Url, Url)>, } @@ -53,6 +66,47 @@ impl SourceBaseInfo { Ok((origin, subpath, url.scheme() != "file")) } + /// Constructs a `SourceBaseInfo` from the given input source, root and base + /// pair, and fallback base. + /// + /// # Arguments + /// + /// * `source` - The input source which contains the links we want to resolve. + /// * `root_and_base` - An optional pair of root directory and base URL. The + /// somewhat complicated type encodes the fact that if a [`Base`] is provided, + /// then a [`Path`] must be provided too. If the base URL is omitted but root + /// dir is provided, the base URL defaults to the root dir. + /// * `fallback_base` - A fallback base URL to use where no other well-founded + /// base URL can be derived. If it is applied, the fallback base URL is + /// considered to be a well-founded base. + /// + /// # Root and base + /// + /// The given root and base URL are used to transform the intrinsic base returned + /// by [`InputSource::to_url`]. If the intrinsic base is a subpath of the given + /// root, then a new base is constructed by taking the intrinsic base and replacing + /// the root dir with the given base URL. + /// + /// In this way, links from local files can be resolved *as if* they were hosted + /// in a remote location at the base URL. Later, in [`SourceBaseInfo::parse_uri`], + /// remote links which are subpaths of the base URL will be reflected back to + /// local files within the root dir. 
+ /// + /// # Well-founded bases + /// + /// Formally, a *well-founded* base is one which is derived from an input + /// source which is *not* a local file, or one derived from a local file + /// source which is a descendent of the given root dir. + /// + /// Informally, and importantly for using [`SourceBaseInfo`], a well-founded + /// base is one where we can sensibly resolve root-relative links (i.e., + /// relative links starting with `/`). + /// + /// # Errors + /// + /// This function fails with an [`Err`] if: + /// - any of the provided arguments cannot be converted to a URL, or + /// - [`SourceBaseInfo::new`] fails. pub fn from_source( source: &InputSource, root_and_base: Option<(&Path, Option<&Base>)>, From 80fc8c78aa876fa76b0bfba951b226f985ddef9f Mon Sep 17 00:00:00 2001 From: rina Date: Mon, 8 Sep 2025 01:34:35 +1000 Subject: [PATCH 39/59] root_and_base up to main --- fixtures/configs/smoketest.toml | 1 + lychee-bin/src/main.rs | 31 +++++++++++++++++-------------- lychee-bin/src/options.rs | 22 +++++++++++++++++----- lychee-lib/src/collector.rs | 22 ++++++---------------- 4 files changed, 41 insertions(+), 35 deletions(-) diff --git a/fixtures/configs/smoketest.toml b/fixtures/configs/smoketest.toml index 1e8874754b..10613766ce 100644 --- a/fixtures/configs/smoketest.toml +++ b/fixtures/configs/smoketest.toml @@ -81,6 +81,7 @@ remap = [ # Base URL or website root directory to check relative URLs. base_url = "https://example.com" +root_dir = "/tmp/root/dir" # HTTP basic auth support. This will be the username and password passed to the # authorization HTTP header. See diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index 8416ebcf80..42f34e3867 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -330,20 +330,23 @@ async fn run(opts: &LycheeOptions) -> Result { return Ok(exit_code as i32); } - let mut collector = Collector::new( - opts.config.root_dir.clone(), - base, - opts.config.fallback_base_url.clone(), - )? 
- .skip_missing_inputs(opts.config.skip_missing) - .skip_hidden(!opts.config.hidden) - .skip_ignored(!opts.config.no_ignore) - .include_verbatim(opts.config.include_verbatim) - .headers(HeaderMap::from_header_pairs(&opts.config.header)?) - .excluded_paths(PathExcludes::new(opts.config.exclude_path.clone())?) - // File a bug if you rely on this envvar! It's going to go away eventually. - .use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").is_ok_and(|x| x == "1")) - .include_wikilinks(opts.config.include_wikilinks); + let root_and_base = match (opts.config.root_dir.clone(), base) { + (None, None) => None, + (Some(root_dir), base) => Some((root_dir, base)), + // clap requirements should make this panic unreachable + (None, Some(_base)) => panic!("root dir must be specified when base is specified!"), + }; + + let mut collector = Collector::new(root_and_base, opts.config.fallback_base_url.clone())? + .skip_missing_inputs(opts.config.skip_missing) + .skip_hidden(!opts.config.hidden) + .skip_ignored(!opts.config.no_ignore) + .include_verbatim(opts.config.include_verbatim) + .headers(HeaderMap::from_header_pairs(&opts.config.header)?) + .excluded_paths(PathExcludes::new(opts.config.exclude_path.clone())?) + // File a bug if you rely on this envvar! It's going to go away eventually. + .use_html5ever(std::env::var("LYCHEE_USE_HTML5EVER").is_ok_and(|x| x == "1")) + .include_wikilinks(opts.config.include_wikilinks); collector = if let Some(ref basic_auth) = opts.config.basic_auth { collector.basic_auth_extractor(BasicAuthExtractor::new(basic_auth)?) diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index bef5c56760..36bff9cf1d 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -658,7 +658,7 @@ separated list of accepted status codes. 
This example will accept 200, 201, pub(crate) method: String, /// Deprecated; use `--base-url` instead - #[arg(long, value_parser = parse_base)] + #[arg(long, value_parser = parse_base, requires_if(ArgPredicate::IsPresent, "root_dir"))] #[serde(skip)] pub(crate) base: Option, @@ -855,42 +855,54 @@ impl Config { // Keys with defaults to assign accept: StatusCodeSelector::default(); + archive: None; + base: None; base_url: None; basic_auth: None; - cache_exclude_status: StatusCodeExcluder::default(); cache: false; + cache_exclude_status: StatusCodeExcluder::default(); cookie_jar: None; + dump: false; + dump_inputs: false; + exclude: Vec::::new(); exclude_all_private: false; exclude_file: Vec::::new(); // deprecated exclude_link_local: false; exclude_loopback: false; exclude_path: Vec::::new(); exclude_private: false; - exclude: Vec::::new(); extensions: FileType::default_extensions(); - fallback_extensions: Vec::::new(); fallback_base_url: None; + fallback_extensions: Vec::::new(); format: StatsFormat::default(); glob_ignore_case: false; header: Vec::<(String, String)>::new(); + hidden: false; + include: Vec::::new(); include_fragments: false; include_mail: false; include_verbatim: false; include_wikilinks: false; - include: Vec::::new(); + index_files: None; insecure: false; max_cache_age: humantime::parse_duration(DEFAULT_MAX_CACHE_AGE).unwrap(); max_concurrency: DEFAULT_MAX_CONCURRENCY; max_redirects: DEFAULT_MAX_REDIRECTS; max_retries: DEFAULT_MAX_RETRIES; method: DEFAULT_METHOD; + min_tls: None; + mode: OutputMode::Color; + no_ignore: false; no_progress: false; + offline: false; output: None; remap: Vec::::new(); require_https: false; retry_wait_time: DEFAULT_RETRY_WAIT_TIME_SECS; + root_dir: None; scheme: Vec::::new(); skip_missing: false; + suggest: false; threads: None; timeout: DEFAULT_TIMEOUT_SECS; user_agent: DEFAULT_USER_AGENT; diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index f7f91ebd2a..c7ba29dfc5 100644 --- 
a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -71,25 +71,14 @@ impl Collector { /// Returns an `Err` if the `root_dir` is not an absolute path /// or if the reqwest `Client` fails to build pub fn new( - root_dir: Option, - base: Option, + root_and_base: Option<(PathBuf, Option)>, fallback_base: Option, ) -> Result { - if let Some(root_dir) = &root_dir { + if let Some((root_dir, _)) = &root_and_base { if root_dir.is_relative() { return Err(ErrorKind::RootDirMustBeAbsolute(root_dir.clone())); } } - let root_and_base = match (root_dir, base) { - (None, Some(base)) => { - return Err(ErrorKind::InvalidBase( - base.to_string(), - "base must be specified alongside root dir, but root dir is unset".to_string(), - )); - } - (None, None) => None, - (Some(root_dir), base) => Some((root_dir, base)), - }; Ok(Collector { basic_auth_extractor: None, @@ -336,7 +325,8 @@ mod tests { ) -> Result> { // NOTE: base is passed as fallback_base because these tests are written // to test the old behaviour. - let responses = Collector::new(root_dir, None, fallback_base)?.collect_links(inputs); + let responses = + Collector::new(root_dir.map(|x| (x, None)), fallback_base)?.collect_links(inputs); Ok(responses.map(|r| r.unwrap().uri).collect().await) } @@ -350,7 +340,7 @@ mod tests { base: Option, extensions: FileExtensions, ) -> Result> { - let responses = Collector::new(root_dir, base, None)? + let responses = Collector::new(root_dir.map(|x| (x, base)), None)? 
.include_verbatim(true) .collect_links_from_file_types(inputs, extensions); Ok(responses.map(|r| r.unwrap().uri).collect().await) @@ -430,7 +420,7 @@ mod tests { }), ]); - let collector = Collector::new(Some(temp_dir_path.to_path_buf()), None, None)?; + let collector = Collector::new(Some((temp_dir_path.to_path_buf(), None)), None)?; let sources: Vec<_> = collector.collect_sources(inputs).collect().await; From d2ceb227482a5c0df94ed7e113f179aed30916b0 Mon Sep 17 00:00:00 2001 From: rina Date: Sat, 4 Oct 2025 21:17:26 +1000 Subject: [PATCH 40/59] fix main compilation --- lychee-bin/src/options.rs | 1 + lychee-lib/src/types/base.rs | 2 +- lychee-lib/src/types/base_info.rs | 4 +-- lychee-lib/src/types/input/source.rs | 40 ++++++++++++++-------------- lychee-lib/src/utils/request.rs | 5 +--- 5 files changed, 25 insertions(+), 27 deletions(-) diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index bb4192970d..944c6b3ba1 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -961,6 +961,7 @@ impl Config { exclude_private: false, extensions: FileType::default_extensions(), fallback_extensions: Vec::::new(), + fallback_base_url: None, format: StatsFormat::default(), glob_ignore_case: false, hidden: false, diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs index e1c9df885a..0a454d28fd 100644 --- a/lychee-lib/src/types/base.rs +++ b/lychee-lib/src/types/base.rs @@ -53,7 +53,7 @@ impl Base { // We keep the username and password intact Some(Base::Remote(*base_url)) } - InputSource::FsPath(path) => path.clone().canonicalize().ok().map(Base::Local), + ResolvedInputSource::FsPath(path) => path.clone().canonicalize().ok().map(Base::Local), // other inputs do not have a URL to extract a base _ => None, } diff --git a/lychee-lib/src/types/base_info.rs b/lychee-lib/src/types/base_info.rs index 64dd974fcf..ea777d10a2 100644 --- a/lychee-lib/src/types/base_info.rs +++ b/lychee-lib/src/types/base_info.rs @@ -3,7 +3,7 @@ use 
std::path::Path; use crate::Base; use crate::ErrorKind; -use crate::InputSource; +use crate::ResolvedInputSource; use crate::Uri; use crate::types::uri::raw::RawUri; use crate::utils::url::ReqwestUrlExt; @@ -108,7 +108,7 @@ impl SourceBaseInfo { /// - any of the provided arguments cannot be converted to a URL, or /// - [`SourceBaseInfo::new`] fails. pub fn from_source( - source: &InputSource, + source: &ResolvedInputSource, root_and_base: Option<(&Path, Option<&Base>)>, fallback_base: Option<&Base>, ) -> Result { diff --git a/lychee-lib/src/types/input/source.rs b/lychee-lib/src/types/input/source.rs index 4b404b98d8..775bfef0e5 100644 --- a/lychee-lib/src/types/input/source.rs +++ b/lychee-lib/src/types/input/source.rs @@ -43,26 +43,6 @@ pub enum InputSource { String(Cow<'static, str>), } -impl InputSource { - /// Converts an [`InputSource::RemoteUrl`] or [`InputSource::FsPath`] - /// to a [`Url`] pointing to the source. - /// - /// The outer result indicates whether the operation succeeded. - /// For `InputSource` variants which are not `RemoteUrl` or `FsPath`, - /// the operation will "succeed" with `None`. - pub fn to_url(&self) -> Result, ErrorKind> { - match self { - Self::RemoteUrl(url) => Ok(Some(url.deref().clone())), - Self::FsPath(path) => std::path::absolute(path) - .ok() - .and_then(|x| Url::from_file_path(x).ok()) - .ok_or_else(|| ErrorKind::InvalidUrlFromPath(path.to_owned())) - .map(Some), - _ => Ok(None), - } - } -} - /// Resolved input sources that can be processed for content. /// /// This represents input sources after glob pattern expansion. @@ -83,6 +63,26 @@ pub enum ResolvedInputSource { String(Cow<'static, str>), } +impl ResolvedInputSource { + /// Converts an [`InputSource::RemoteUrl`] or [`InputSource::FsPath`] + /// to a [`Url`] pointing to the source. + /// + /// The outer result indicates whether the operation succeeded. 
+ /// For `InputSource` variants which are not `RemoteUrl` or `FsPath`, + /// the operation will "succeed" with `None`. + pub fn to_url(&self) -> Result, ErrorKind> { + match self { + Self::RemoteUrl(url) => Ok(Some(url.deref().clone())), + Self::FsPath(path) => std::path::absolute(path) + .ok() + .and_then(|x| Url::from_file_path(x).ok()) + .ok_or_else(|| ErrorKind::InvalidUrlFromPath(path.to_owned())) + .map(Some), + _ => Ok(None), + } + } +} + impl From for InputSource { fn from(resolved: ResolvedInputSource) -> Self { match resolved { diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index a494072789..cc535e9c90 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -11,10 +11,8 @@ use crate::types::SourceBaseInfo; use crate::{ Base, BasicAuthCredentials, ErrorKind, Request, Result, Uri, basic_auth::BasicAuthExtractor, - types::{InputSource, uri::raw::RawUri}, - utils::{path, url, url::ReqwestUrlExt}, types::{ResolvedInputSource, uri::raw::RawUri}, - utils::{path, url}, + utils::{path, url, url::ReqwestUrlExt}, }; use ::url::ParseError; @@ -111,7 +109,6 @@ pub(crate) fn create( let base_info = match SourceBaseInfo::from_source(source, root_and_base, fallback_base) { Ok(base_info) => base_info, Err(e) => { - let source = truncate_source(source); warn!("Error handling source {source}: {e:?}"); return HashSet::new(); } From d2d359112c535184fff02334a38ae316a38b05a2 Mon Sep 17 00:00:00 2001 From: rina Date: Sat, 4 Oct 2025 21:18:45 +1000 Subject: [PATCH 41/59] fix test compilation too --- lychee-lib/src/types/base_info.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lychee-lib/src/types/base_info.rs b/lychee-lib/src/types/base_info.rs index ea777d10a2..5e4fab4c61 100644 --- a/lychee-lib/src/types/base_info.rs +++ b/lychee-lib/src/types/base_info.rs @@ -205,7 +205,7 @@ mod tests { fn test_base_with_filename() { let root_dir = PathBuf::from("/some"); let base = 
Base::try_from("https://example.com/path/page2.html").unwrap(); - let source = InputSource::FsPath(PathBuf::from("/some/page.html")); + let source = ResolvedInputSource::FsPath(PathBuf::from("/some/page.html")); let base_info = SourceBaseInfo::from_source(&source, Some((&root_dir, Some(&base))), None).unwrap(); @@ -222,7 +222,7 @@ mod tests { fn test_base_with_same_filename() { let root_dir = PathBuf::from("/some/pagex.html"); let base = Base::try_from("https://example.com/path/page.html").unwrap(); - let source = InputSource::FsPath(PathBuf::from("/some/pagex.html")); + let source = ResolvedInputSource::FsPath(PathBuf::from("/some/pagex.html")); let base_info = SourceBaseInfo::from_source(&source, Some((&root_dir, Some(&base))), None).unwrap(); From 0fea5ebc15a4403384d3b5387f62859fc6ea5e21 Mon Sep 17 00:00:00 2001 From: rina Date: Tue, 18 Nov 2025 16:19:16 +1000 Subject: [PATCH 42/59] fix compilation --- lychee-lib/src/utils/request.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index a84a422be1..6b2fefe6dd 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -1,3 +1,4 @@ +use log::warn; use percent_encoding::percent_decode_str; use reqwest::Url; use std::borrow::Cow; @@ -109,7 +110,7 @@ pub(crate) fn create( Err(e) => { // TODO: return an error inside this vec. 
warn!("Error handling source {source}: {e:?}"); - return vec![]: + return vec![]; } }; From 9f79231b292fd483edabb95fc617604244e57746 Mon Sep 17 00:00:00 2001 From: rina Date: Tue, 18 Nov 2025 16:32:23 +1000 Subject: [PATCH 43/59] fix tests --- README.md | 40 ++++++++++++++++----------------- lychee-lib/src/utils/request.rs | 31 +++++++++++++------------ lychee.example.toml | 3 +++ 3 files changed, 38 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index f8ae0a2300..4413b23df5 100644 --- a/README.md +++ b/README.md @@ -474,6 +474,26 @@ Options: [default: compact] [possible values: compact, detailed, json, markdown, raw] + --fallback-base-url + Fallback base URL used for inputs where no more suitable base URL applies. + Each input source may have an associated base URL which describes where that + input was located, for the purpose of resolving relative links. Where Lychee + cannot determine a *well-founded* base URL for an input source, this fallback + base URL will be used. + + A *well-founded* base URL is one which: + - originates from a remote URL, in which case the base URL is just the remote URL, or + - originates from a local file where `--root-dir` has been specified and the local + file path is a subpath of `--root-dir`. + + In all other cases, the base URL is not well-founded and this fallback base URL + applies. In particular, this includes all links passed by stdin and, if `--root-dir` + is unspecified, this includes all links within local files. + + Note that this fallback base URL applies without consideration to local file paths. + For local files, it is usually better to specify `--base-url` and `--root-dir` + which will construct a base URL while considering subpaths of `--root-dir`. + --fallback-extensions When checking locally, attempts to locate missing files by trying the given fallback extensions. Multiple extensions can be separated by commas. 
Extensions @@ -678,26 +698,6 @@ Options: available at the remote URL of `--base-url`. See the help of `--base-url` for more information. - --fallback-base-url - Fallback base URL used for inputs where no more suitable base URL applies. - Each input source may have an associated base URL which describes where that - input was located, for the purpose of resolving relative links. Where Lychee - cannot determine a *well-founded* base URL for an input source, this fallback - base URL will be used. - - A *well-founded* base URL is one which: - - originates from a remote URL, in which case the base URL is just the remote URL, or - - originates from a local file where `--root-dir` has been specified and the local - file path is a subpath of `--root-dir`. - - In all other cases, the base URL is not well-founded and this fallback base URL - applies. In particular, this includes all links passed by stdin and, if `--root-dir` - is unspecified, this includes all links within local files. - - Note that this fallback base URL applies without consideration to local file paths. - For local files, it is usually better to specify `--base-url` and `--root-dir` - which will construct a base URL while considering subpaths of `--root-dir`. - -s, --scheme Only test links with the given schemes (e.g. https). Omit to check links with any other scheme. At the moment, we support http, https, file, and mailto. 
diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 6b2fefe6dd..9624d3239b 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -202,11 +202,11 @@ mod tests { fn create_ok_only( uris: Vec, source: &ResolvedInputSource, - root_dir: Option<&PathBuf>, - base: Option<&Base>, + root_and_base: Option<(&Path, Option<&Base>)>, + fallback_base: Option<&Base>, extractor: Option<&BasicAuthExtractor>, ) -> Vec { - create(uris, source, root_dir, base, extractor) + create(uris, source, root_and_base, fallback_base, extractor) .into_iter() .filter_map(Result::ok) .collect() @@ -489,7 +489,7 @@ mod tests { let source = ResolvedInputSource::String(Cow::Borrowed("")); let uris = vec![raw_uri("https://example.com/page")]; - let requests = create_ok_only_ok_only(uris, &source, None, None, None); + let requests = create_ok_only(uris, &source, None, None, None); assert_eq!(requests.len(), 1); assert!( @@ -500,13 +500,13 @@ mod tests { } #[test] - fn test_create_ok_only_request_from_relative_file_path() { + fn test_create_request_from_relative_file_path() { let base = Base::Local(PathBuf::from("/tmp/lychee")); let input_source = ResolvedInputSource::FsPath(PathBuf::from("page.html")); let base_info = SourceBaseInfo::from_source(&input_source, None, Some(&base)).unwrap(); let actual = - create_ok_only_request(&raw_uri("file.html"), &input_source, &base_info, None).unwrap(); + create_request(&raw_uri("file.html"), &input_source, &base_info, None).unwrap(); assert_eq!( actual, @@ -523,26 +523,25 @@ mod tests { } #[test] - fn test_create_ok_only_request_from_relative_file_path_errors() { + fn test_create_request_from_relative_file_path_errors() { // relative links unsupported from stdin assert!( - create_ok_only_request( + create_request( &raw_uri("file.html"), &ResolvedInputSource::Stdin, - None, - None, + &SourceBaseInfo::from_source(&ResolvedInputSource::Stdin, None, None).unwrap(), None, ) .is_err() ); // error because no 
root-dir and no base-url + let src = ResolvedInputSource::FsPath(PathBuf::from("page.html")); assert!( - create_ok_only_request( + create_request( &raw_uri("/file.html"), - &ResolvedInputSource::FsPath(PathBuf::from("page.html")), - None, - None, + &src, + &SourceBaseInfo::from_source(&src, None, None).unwrap(), None, ) .is_err() @@ -550,13 +549,13 @@ mod tests { } #[test] - fn test_create_ok_only_request_from_absolute_file_path() { + fn test_create_request_from_absolute_file_path() { let base = Base::Local(PathBuf::from("/")); let input_source = ResolvedInputSource::FsPath(PathBuf::from("/tmp/lychee/page.html")); let base_info = SourceBaseInfo::from_source(&input_source, None, Some(&base)).unwrap(); // Use an absolute path that's outside the base directory - let actual = create_ok_only_request( + let actual = create_request( &raw_uri("/usr/local/share/doc/example.html"), &input_source, &base_info, diff --git a/lychee.example.toml b/lychee.example.toml index e8989b67b4..96bc29c2b3 100644 --- a/lychee.example.toml +++ b/lychee.example.toml @@ -114,6 +114,9 @@ base_url = "https://example.com" # Root path to use when checking absolute local links, must be an absolute path root_dir = "/dist" +# Fallback base URL to use for input sources with no base URL +fallback_base_url = "https://example.com/fallback.html" + # HTTP basic auth support. This will be the username and password passed to the # authorization HTTP header. See # From 5aac1f1b5a68f7a34edc7be7f685db32a2ebbbaa Mon Sep 17 00:00:00 2001 From: rina Date: Tue, 18 Nov 2025 16:33:29 +1000 Subject: [PATCH 44/59] typo --- README.md | 2 +- lychee-bin/src/options.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4413b23df5..ae15dd4dad 100644 --- a/README.md +++ b/README.md @@ -693,7 +693,7 @@ Options: `/tmp/up.html` and not `/up.html`. This is because if `/tmp` was uploaded to a website root, traversing up beyond the root would not change the path. 
- Additiionally, this option can be specified alongside `--base-url`. If both are + Additionally, this option can be specified alongside `--base-url`. If both are given, the behavior is augmented to resolve links as if `--root-dir` was available at the remote URL of `--base-url`. See the help of `--base-url` for more information. diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index ec3309b955..641e72ed0f 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -791,7 +791,7 @@ If specified, `--root-dir` acts according to three main rules: `/tmp/up.html` and not `/up.html`. This is because if `/tmp` was uploaded to a website root, traversing up beyond the root would not change the path. -Additiionally, this option can be specified alongside `--base-url`. If both are +Additionally, this option can be specified alongside `--base-url`. If both are given, the behavior is augmented to resolve links as if `--root-dir` was available at the remote URL of `--base-url`. See the help of `--base-url` for more information." 
From aa7c3ba040710c6543cf05ac98a4916427c2fb49 Mon Sep 17 00:00:00 2001 From: rina Date: Wed, 10 Dec 2025 12:26:34 +1000 Subject: [PATCH 45/59] restore root-dir existence check, but ONLY for relative dirs ;-; --- lychee-lib/src/collector.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 7948ece5bc..d2a9176654 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -76,6 +76,14 @@ impl Collector { root_and_base: Option<(PathBuf, Option)>, fallback_base: Option, ) -> LycheeResult { + if let Some((root_dir, _)) = &root_and_base + && !root_dir.is_absolute() + { + match root_dir.metadata() { + Ok(_) => (), + Err(e) => return Err(ErrorKind::InvalidRootDir(root_dir.to_path_buf(), e)), + } + } Ok(Collector { basic_auth_extractor: None, skip_missing_inputs: false, From d84ac6d1fd33e611c5407a23cf722ff194f9ce35 Mon Sep 17 00:00:00 2001 From: rina Date: Wed, 10 Dec 2025 16:21:26 +1000 Subject: [PATCH 46/59] always check root-dir --- fixtures/configs/smoketest.toml | 2 +- lychee-lib/src/collector.rs | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fixtures/configs/smoketest.toml b/fixtures/configs/smoketest.toml index a9eb6ece88..a721cf904e 100644 --- a/fixtures/configs/smoketest.toml +++ b/fixtures/configs/smoketest.toml @@ -81,7 +81,7 @@ remap = [ # Base URL or website root directory to check relative URLs. base_url = "https://example.com" -root_dir = "/tmp/root/dir" +root_dir = "." # HTTP basic auth support. This will be the username and password passed to the # authorization HTTP header. 
See diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index d2a9176654..e4d5c9aba1 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -76,9 +76,7 @@ impl Collector { root_and_base: Option<(PathBuf, Option)>, fallback_base: Option, ) -> LycheeResult { - if let Some((root_dir, _)) = &root_and_base - && !root_dir.is_absolute() - { + if let Some((root_dir, _)) = &root_and_base { match root_dir.metadata() { Ok(_) => (), Err(e) => return Err(ErrorKind::InvalidRootDir(root_dir.to_path_buf(), e)), From 70fbe33cd911ba96d862c2dba017556c06ea6d9a Mon Sep 17 00:00:00 2001 From: rina Date: Wed, 10 Dec 2025 16:32:43 +1000 Subject: [PATCH 47/59] todo --- lychee-lib/src/types/base_info.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lychee-lib/src/types/base_info.rs b/lychee-lib/src/types/base_info.rs index b7e15ab869..b786b36330 100644 --- a/lychee-lib/src/types/base_info.rs +++ b/lychee-lib/src/types/base_info.rs @@ -161,9 +161,12 @@ impl SourceBaseInfo { pub fn parse_uri(&self, raw_uri: &RawUri) -> Result { let is_absolute = || raw_uri.text.trim_ascii_start().starts_with('/'); + println!("{:?}", self); let Uri { url } = Uri::try_from(raw_uri.clone()).or_else(|e| match &self.base { Some((_, _, _allow_absolute @ false)) if is_absolute() => { + // TODO: report more errors if a --root-dir is specified but URL falls outside of + // thingy Err(ErrorKind::InvalidBaseJoin(raw_uri.text.clone())) } Some((origin, subpath, _)) => origin From ab41e77d8af37ae54d236ac567ad15bb9e4d5bc3 Mon Sep 17 00:00:00 2001 From: rina Date: Sun, 21 Dec 2025 15:49:05 +1000 Subject: [PATCH 48/59] what is happening T_T filename URLs are so hard. what can you even do???? what counts as a "subpath" of a URL with filename? what makes sense is to ignore the filename semantics and just use "file path semantics". so /a JOIN b would be /a/b. but this causes problems when we use URL operations which do URL joining. 
--- lychee-lib/src/utils/url.rs | 130 +++++++++++++++++++++++++++++++++++- 1 file changed, 127 insertions(+), 3 deletions(-) diff --git a/lychee-lib/src/utils/url.rs b/lychee-lib/src/utils/url.rs index 08df4c80e9..af7ecdfd9d 100644 --- a/lychee-lib/src/utils/url.rs +++ b/lychee-lib/src/utils/url.rs @@ -33,9 +33,58 @@ pub(crate) trait ReqwestUrlExt { impl ReqwestUrlExt for Url { fn strip_prefix(&self, prefix: &Url) -> Option { - prefix - .make_relative(self) - .filter(|subpath| !subpath.starts_with("../") && !subpath.starts_with('/')) + let mut prefix_segments = prefix.path_segments()?.peekable(); + let mut url_segments = self.path_segments()?.peekable(); + + // strip last component from prefix segments. this will either be + // a real non-empty filename, or an empty string if prefix ends in `/`. + let prefix_filename = prefix.path_segments()?.last(); + + if prefix_filename.is_some_and(|x| x == "") { + let _ = prefix_segments.next_back(); + } + + while let Some(s1) = prefix_segments.peek() + && let Some(s2) = url_segments.peek() + && s1 == s2 + { + let _ = prefix_segments.next(); + let _ = url_segments.next(); + } + + let remaining_prefix = prefix_segments.collect::>(); + let remaining_url = url_segments.collect::>(); + + println!("{:?}", remaining_prefix); + println!("{:?}", remaining_url); + + let relative = match (&remaining_prefix[..], &remaining_url[..]) { + ([], []) => Some(String::new()), + + // URL is a suffix of prefix (possibly aside from filename). + // we can just use the rest of the URL. 
+ ([], rest) => match prefix_filename { + None | Some("") => rest.join("/"), + Some(filename) => format!("{filename}/{}", rest.join("/")), + }.into(), + + _ => None, + }; + + let relative = relative.map(|x| { + if x.starts_with("/") { + format!(".{x}") + } else { + x + } + }); + + println!("x={:?}", relative); + + relative + // prefix + // .make_relative(self) + // .filter(|subpath| !subpath.starts_with("../") && !subpath.starts_with('/')) // .inspect(|x| println!("subpathing {}", x)) // .filter(|_| prefix.as_str().starts_with(self.as_str())) } @@ -79,6 +128,12 @@ impl ReqwestUrlExt for Url { mod test_url_ext { use super::*; + macro_rules! url { + ($x: expr) => { + Url::parse($x).unwrap() + }; + } + #[test] fn test_strip_prefix() { // note trailing slashes for subpaths, otherwise everything becomes siblings @@ -96,6 +151,75 @@ mod test_url_ext { assert_eq!(goog_subpath.strip_prefix(&goog_subsubpath).as_deref(), None); } + + #[test] + fn test_fdsa() { + assert_eq!( + url!("https://a.com/b/x") + .strip_prefix(&url!("https://a.com/b/x")) + .as_deref(), + Some("") + ); + assert_eq!( + url!("https://a.com/b/x") + .strip_prefix(&url!("https://a.com/b/aa")) + .as_deref(), + None + ); + assert_eq!( + url!("https://a.com/b/x") + .strip_prefix(&url!("https://a.com/b/")) + .as_deref(), + Some("x") + ); + assert_eq!( + url!("https://a.com/b/x") + .strip_prefix(&url!("https://a.com/b")) + .as_deref(), + Some("b/x") + ); + assert_eq!( + url!("https://a.com/b/x") + .strip_prefix(&url!("https://a.com/a")) + .as_deref(), + None + ); + assert_eq!( + url!("https://a.com/b/x") + .strip_prefix(&url!("https://a.com/a/")) + .as_deref(), + None + ); + + assert_eq!( + url!("https://a.com/b//x") + .strip_prefix(&url!("https://a.com/b/")) + .as_deref(), + Some("./x") + ); + assert_eq!( + url!("https://a.com/b///x") + .strip_prefix(&url!("https://a.com/b/")) + .as_deref(), + Some(".//x") + ); + + println!( + "{:?}", + url!("https://a.com/b//x") + .path_segments() + .unwrap() + .collect::>() + 
); + println!( + "{:?}", + url!("https://a.com/b/") + .path_segments() + .unwrap() + .collect::>() + ); + panic!(); + } } #[cfg(test)] From 36dcee794149486807466ac2e68d4b46111a45ff Mon Sep 17 00:00:00 2001 From: rina Date: Sun, 21 Dec 2025 17:13:51 +1000 Subject: [PATCH 49/59] do the thing with being super strict about URL filenames --- lychee-lib/src/utils/url.rs | 127 ++++++++++++++++++++++++++---------- 1 file changed, 91 insertions(+), 36 deletions(-) diff --git a/lychee-lib/src/utils/url.rs b/lychee-lib/src/utils/url.rs index af7ecdfd9d..b29d13a5e3 100644 --- a/lychee-lib/src/utils/url.rs +++ b/lychee-lib/src/utils/url.rs @@ -33,55 +33,76 @@ pub(crate) trait ReqwestUrlExt { impl ReqwestUrlExt for Url { fn strip_prefix(&self, prefix: &Url) -> Option { - let mut prefix_segments = prefix.path_segments()?.peekable(); - let mut url_segments = self.path_segments()?.peekable(); + if self.scheme() != prefix.scheme() + || self.authority() != prefix.authority() + || self.port() != prefix.port() + { + return None; + } + + let prefix_has_filename = prefix.path_segments()?.last().is_some_and(|x| x != ""); - // strip last component from prefix segments. this will either be - // a real non-empty filename, or an empty string if prefix ends in `/`. 
- let prefix_filename = prefix.path_segments()?.last(); + let relative = if prefix_has_filename { + if self.path() == prefix.path() { + Some(String::new()) + } else { + None + } + } else { + let mut prefix_segments = prefix.path_segments()?.peekable(); + let mut url_segments = self.path_segments()?.peekable(); - if prefix_filename.is_some_and(|x| x == "") { + // discard "" entry from the end of the prefix let _ = prefix_segments.next_back(); - } - while let Some(s1) = prefix_segments.peek() - && let Some(s2) = url_segments.peek() - && s1 == s2 - { - let _ = prefix_segments.next(); - let _ = url_segments.next(); - } + while let Some(s1) = prefix_segments.peek() + && let Some(s2) = url_segments.peek() + && s1 == s2 + { + let _ = prefix_segments.next(); + let _ = url_segments.next(); + } - let remaining_prefix = prefix_segments.collect::>(); - let remaining_url = url_segments.collect::>(); + let remaining_prefix = prefix_segments.collect::>(); + let remaining_url = url_segments.collect::>(); - println!("{:?}", remaining_prefix); - println!("{:?}", remaining_url); + println!("{:?}", remaining_prefix); + println!("{:?}", remaining_url); - let relative = match (&remaining_prefix[..], &remaining_url[..]) { - ([], []) => Some(String::new()), + let relative = match (&remaining_prefix[..], &remaining_url[..]) { + // if nothing is remaining in URL, then we have prefix=/a/, url=/a. + // this should NOT be considered a match. + ([], []) => None, - // URL is a suffix of prefix (possibly aside from filename). - // we can just use the rest of the URL. 
- ([], rest) => match prefix_filename { - None | Some("") => rest.join("/"), - Some(filename) => format!("{filename}/{}", rest.join("/")), - }.into(), + ([], rest) => Some(rest.join("/")), - _ => None, + _ => None, + }; + + relative.map(|x| { + if x.starts_with("/") { + format!(".{x}") + } else { + x + } + }) }; - let relative = relative.map(|x| { - if x.starts_with("/") { - format!(".{x}") - } else { - x + println!("x={:?}", relative); + + relative.map(|mut relative| { + if let Some(query) = self.query() { + relative.push('?'); + relative.push_str(query); } - }); - println!("x={:?}", relative); + if let Some(fragment) = self.fragment() { + relative.push('#'); + relative.push_str(fragment); + } + relative + }) - relative // prefix // .make_relative(self) // .filter(|subpath| !subpath.starts_with("../") && !subpath.starts_with('/')) @@ -154,29 +175,62 @@ mod test_url_ext { #[test] fn test_fdsa() { + // exact match assert_eq!( url!("https://a.com/b/x") .strip_prefix(&url!("https://a.com/b/x")) .as_deref(), Some("") ); + assert_eq!( + url!("https://a.com/b/") + .strip_prefix(&url!("https://a.com/b/")) + .as_deref(), + Some("") + ); + assert_eq!( + url!("https://a.com/b/x?a=2") + .strip_prefix(&url!("https://a.com/b/x?b=x")) + .as_deref(), + Some("?a=2") + ); + + // no matches due to / difference + assert_eq!( + url!("https://a.com/b") + .strip_prefix(&url!("https://a.com/b/")) + .as_deref(), + None + ); + assert_eq!( + url!("https://a.com/b/") + .strip_prefix(&url!("https://a.com/b")) + .as_deref(), + None + ); + + // changing filename leads to no match assert_eq!( url!("https://a.com/b/x") .strip_prefix(&url!("https://a.com/b/aa")) .as_deref(), None ); + + // matching in subdir assert_eq!( url!("https://a.com/b/x") .strip_prefix(&url!("https://a.com/b/")) .as_deref(), Some("x") ); + + // no match assert_eq!( url!("https://a.com/b/x") .strip_prefix(&url!("https://a.com/b")) .as_deref(), - Some("b/x") + None ); assert_eq!( url!("https://a.com/b/x") @@ -191,6 +245,7 @@ 
mod test_url_ext { None ); + // matches and maintains extra ./ inside url. assert_eq!( url!("https://a.com/b//x") .strip_prefix(&url!("https://a.com/b/")) From 15312f46ce2cfe15e98d2b8bd37182279fda2b68 Mon Sep 17 00:00:00 2001 From: rina Date: Sat, 24 Jan 2026 01:23:52 +1000 Subject: [PATCH 50/59] refactor to separate base info and mappings. neater but harder to use can we bundle these two things together somehow????????? maybe we just hoist the SourceBaseInfo up so there's fewer Path/Base things and less drilling of root_and_base. in fact, UrlMappings can be constant for an entire run. SourceBaseInfo is per source. how can we deal with fallback and mapping nicely??? fallback and mapping both affect the preparation and the parsing --- lychee-lib/src/types/base_info.rs | 255 ------------------------ lychee-lib/src/types/base_mapping.rs | 281 +++++++++++++++++++++++++++ lychee-lib/src/types/mod.rs | 4 +- lychee-lib/src/utils/request.rs | 47 +++-- 4 files changed, 313 insertions(+), 274 deletions(-) delete mode 100644 lychee-lib/src/types/base_info.rs create mode 100644 lychee-lib/src/types/base_mapping.rs diff --git a/lychee-lib/src/types/base_info.rs b/lychee-lib/src/types/base_info.rs deleted file mode 100644 index b786b36330..0000000000 --- a/lychee-lib/src/types/base_info.rs +++ /dev/null @@ -1,255 +0,0 @@ -use reqwest::Url; -use std::path::Path; - -use crate::Base; -use crate::ErrorKind; -use crate::ResolvedInputSource; -use crate::Uri; -use crate::types::uri::raw::RawUri; -use crate::utils::url::ReqwestUrlExt; -use url::PathSegmentsMut; - -/// Information needed for resolving relative URLs within a particular -/// [`InputSource`]. The main entry point for constructing a `SourceBaseInfo` -/// is [`SourceBaseInfo::from_source`]. Once constructed, -/// [`SourceBaseInfo::parse_uri`] can be used to parse a URI found within -/// the `InputSource`. -/// -/// A `SourceBaseInfo` may or may not have an associated base which is used -/// for resolving relative URLs. 
If no base is available, parsing relative -/// and root-relative links will fail. If a base is available but it is not -/// *well-founded*, then parsing root-relative links will fail. See -/// [`SourceBaseInfo::from_source`] for a description of well-founded. -#[derive(Debug, PartialEq, Eq, Clone)] -pub struct SourceBaseInfo { - /// Tuple of `origin`, `subpath`, `allow_absolute`. The field `allow_absolute` - /// is true if the base is well-founded. - base: Option<(Url, String, bool)>, - /// List of tuples of `remote_url`, `local_url`. - remote_local_mappings: Vec<(Url, Url)>, -} - -impl SourceBaseInfo { - pub fn new( - base: Option<(Url, String, bool)>, - remote_local_mappings: Vec<(Url, Url)>, - ) -> Result { - // TODO: check no repeated bases/roots on the same side. - // TODO: choose longest match if multiple could apply - let conflicting_mapping = remote_local_mappings.iter().find(|(remote, local)| { - if remote == local { - false - } else { - remote.strip_prefix(local).is_some() || local.strip_prefix(remote).is_some() - } - }); - - match conflicting_mapping { - Some((base, root)) => Err(ErrorKind::InvalidBase( - base.to_string(), - format!("base cannot be parent or child of root-dir {root}"), - )), - None => Ok(Self { - base, - remote_local_mappings, - }), - } - } - - fn infer_default_base(url: &Url) -> Result<(Url, String, bool), ErrorKind> { - let origin = url - .join("/") - .map_err(|e| ErrorKind::ParseUrl(e, url.to_string()))?; - let subpath = origin - .make_relative(url) - .expect("failed make a url relative to its own origin root?!"); - Ok((origin, subpath, url.scheme() != "file")) - } - - /// Constructs a `SourceBaseInfo` from the given input source, root and base - /// pair, and fallback base. - /// - /// # Arguments - /// - /// * `source` - The input source which contains the links we want to resolve. - /// * `root_and_base` - An optional pair of root directory and base URL. 
The - /// somewhat complicated type encodes the fact that if a [`Base`] is provided, - /// then a [`Path`] must be provided too. If the base URL is omitted but root - /// dir is provided, the base URL defaults to the root dir. - /// * `fallback_base` - A fallback base URL to use where no other well-founded - /// base URL can be derived. If it is applied, the fallback base URL is - /// considered to be a well-founded base. - /// - /// # Root and base - /// - /// The given root and base URL are used to transform the intrinsic base returned - /// by [`InputSource::to_url`]. If the intrinsic base is a subpath of the given - /// root, then a new base is constructed by taking the intrinsic base and replacing - /// the root dir with the given base URL. - /// - /// In this way, links from local files can be resolved *as if* they were hosted - /// in a remote location at the base URL. Later, in [`SourceBaseInfo::parse_uri`], - /// remote links which are subpaths of the base URL will be reflected back to - /// local files within the root dir. - /// - /// # Well-founded bases - /// - /// Formally, a *well-founded* base is one which is derived from an input - /// source which is *not* a local file, or one derived from a local file - /// source which is a descendent of the given root dir. - /// - /// Informally, and importantly for using [`SourceBaseInfo`], a well-founded - /// base is one where we can sensibly resolve root-relative links (i.e., - /// relative links starting with `/`). - /// - /// # Errors - /// - /// This function fails with an [`Err`] if: - /// - any of the provided arguments cannot be converted to a URL, or - /// - [`SourceBaseInfo::new`] fails. 
- pub fn from_source( - source: &ResolvedInputSource, - root_and_base: Option<(&Path, Option<&Base>)>, - fallback_base: Option<&Base>, - ) -> Result { - let root_and_base: Option<(Url, Url)> = match root_and_base { - Some((root, Some(base))) => Some((root, base.clone())), - Some((root, None)) => Some((root, Base::Local(root.to_owned()))), - None => None, - } - .map(|(root, base)| -> Result<_, ErrorKind> { - let root_url = Base::Local(root.to_owned()).to_url()?; - Ok((root_url, base.to_url()?)) - }) - .transpose()?; - - let source_url = source.to_url()?; - - let remote_local_mappings = match root_and_base { - Some((root_dir_url, base_url)) => vec![(base_url, root_dir_url)], - _ => vec![], - }; - - let fallback_base_url = fallback_base.map(Base::to_url).transpose()?; - let fallback_base_option = - move || fallback_base_url.map(|url| (url.clone(), String::new(), true)); - - let Some(source_url) = source_url else { - return Self::new(fallback_base_option(), remote_local_mappings); - }; - - let base = remote_local_mappings - .iter() - .find_map(|(remote, local)| { - source_url - .strip_prefix(local) - .map(|subpath| (remote.clone(), subpath, true)) - }) - .map_or_else( - || match Self::infer_default_base(&source_url) { - ok @ Ok((_, _, _allow_absolute @ false)) => { - fallback_base_option().map_or(ok, Ok) - } - Ok(x) => Ok(x), - Err(e) => fallback_base_option().ok_or(e), - }, - Ok, - )?; - - Self::new(Some(base), remote_local_mappings) - } - - pub fn parse_uri(&self, raw_uri: &RawUri) -> Result { - let is_absolute = || raw_uri.text.trim_ascii_start().starts_with('/'); - println!("{:?}", self); - - let Uri { url } = Uri::try_from(raw_uri.clone()).or_else(|e| match &self.base { - Some((_, _, _allow_absolute @ false)) if is_absolute() => { - // TODO: report more errors if a --root-dir is specified but URL falls outside of - // thingy - Err(ErrorKind::InvalidBaseJoin(raw_uri.text.clone())) - } - Some((origin, subpath, _)) => origin - .join_rooted(&[subpath, &raw_uri.text]) 
- .map_err(|e| ErrorKind::ParseUrl(e, raw_uri.text.clone())) - .map(|url| Uri { url }), - None => Err(e), - })?; - - // println!("before mappings: {}", url.as_str()); - - let mut url = self - .remote_local_mappings - .iter() - .find_map(|(remote, local)| { - url.strip_prefix(remote) - .and_then(|subpath| local.join(&subpath).ok()) - }) - .unwrap_or(url); - - // BACKWARDS COMPAT: delete trailing slash for file urls - if url.scheme() == "file" { - let _ = url - .path_segments_mut() - .as_mut() - .map(PathSegmentsMut::pop_if_empty); - } - - Ok(Uri { url }) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::num::NonZeroUsize; - use std::path::PathBuf; - - use crate::types::uri::raw::RawUriSpan; - - fn raw_uri(text: &'static str) -> RawUri { - RawUri { - text: text.to_string(), - element: None, - attribute: None, - span: RawUriSpan { - line: NonZeroUsize::MAX, - column: None, - }, - } - } - - #[test] - fn test_base_with_filename() { - let root_dir = PathBuf::from("/some"); - let base = Base::try_from("https://example.com/path/page2.html").unwrap(); - let source = ResolvedInputSource::FsPath(PathBuf::from("/some/page.html")); - let base_info = - SourceBaseInfo::from_source(&source, Some((&root_dir, Some(&base))), None).unwrap(); - - assert_eq!( - base_info - .parse_uri(&raw_uri("#fragment")) - .as_ref() - .map(|x| x.url.as_str()), - Ok("file:///some/page.html#fragment") - ); - } - - #[test] - fn test_base_with_same_filename() { - let root_dir = PathBuf::from("/some/pagex.html"); - let base = Base::try_from("https://example.com/path/page.html").unwrap(); - let source = ResolvedInputSource::FsPath(PathBuf::from("/some/pagex.html")); - let base_info = - SourceBaseInfo::from_source(&source, Some((&root_dir, Some(&base))), None).unwrap(); - - assert_eq!( - base_info - .parse_uri(&raw_uri("#fragment")) - .as_ref() - .map(|x| x.url.as_str()), - Ok("file:///some/pagex.html#fragment") - ); - } -} diff --git a/lychee-lib/src/types/base_mapping.rs 
b/lychee-lib/src/types/base_mapping.rs new file mode 100644 index 0000000000..d76a64ed04 --- /dev/null +++ b/lychee-lib/src/types/base_mapping.rs @@ -0,0 +1,281 @@ +use reqwest::Url; +use std::path::Path; + +use crate::Base; +use crate::ErrorKind; +use crate::ResolvedInputSource; +use crate::Uri; +use crate::types::uri::raw::RawUri; +use crate::utils::url::ReqwestUrlExt; +use url::PathSegmentsMut; + +/// Information needed for resolving relative URLs within a particular +/// [`InputSource`]. The main entry point for constructing a `SourceBaseInfo` +/// is [`SourceBaseInfo::from_source`]. Once constructed, +/// [`SourceBaseInfo::parse_uri`] can be used to parse a URI found within +/// the `InputSource`. +/// +/// A `SourceBaseInfo` may or may not have an associated base which is used +/// for resolving relative URLs. If no base is available, parsing relative +/// and root-relative links will fail. If a base is available but it is not +/// *well-founded*, then parsing root-relative links will fail. See +/// [`SourceBaseInfo::from_source`] for a description of well-founded. +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct SourceBaseInfo(Option<(Url, String, bool)>); +/// Tuple of `origin`, `subpath`, `allow_absolute`. The field `allow_absolute` +/// is true if the base is well-founded. + +pub struct UrlMappings { + /// List of tuples of `old_url`, `new_url`. + mappings: Vec<(Url, Url)>, +} + +impl UrlMappings { + pub fn new(mappings: Vec<(Url, Url)>) -> Result { + // TODO: check no repeated bases/roots on the same side. 
+ // TODO: choose longest match if multiple could apply + let conflicting_mapping = mappings.iter().find(|(remote, local)| { + if remote == local { + false + } else { + remote.strip_prefix(local).is_some() || local.strip_prefix(remote).is_some() + } + }); + + match conflicting_mapping { + Some((base, root)) => Err(ErrorKind::InvalidBase( + base.to_string(), + format!("base cannot be parent or child of root-dir {root}"), + )), + None => Ok(Self { mappings }), + } + } + + pub fn map_to_old_url(&self, url: &Url) -> Option<(&Url, String)> { + self.mappings + .iter() + .find_map(|(left, right)| url.strip_prefix(left).map(|subpath| (right, subpath))) + } + + pub fn map_to_new_url(&self, url: &Url) -> Option<(&Url, String)> { + self.mappings + .iter() + .find_map(|(left, right)| url.strip_prefix(right).map(|subpath| (left, subpath))) + } +} + +impl SourceBaseInfo { + pub fn new( + origin: Url, + subpath: String, + supports_root_relative: bool, + ) -> Result { + Ok(Self(Some((origin, subpath, supports_root_relative)))) + } + + pub fn none() -> Self { + Self(None) + } + + pub fn supports_root_relative(&self) -> bool { + self.0.as_ref().is_some_and(|x| x.2) + } + + pub fn or_fallback(self, fallback: Self) -> Self { + if self.supports_root_relative() { + self + } else { + fallback + } + } + + pub fn infer_source_base(url: &Url) -> Result { + let origin = url + .join("/") + .map_err(|e| ErrorKind::ParseUrl(e, url.to_string()))?; + let subpath = origin + .make_relative(url) + .expect("failed make a url relative to its own origin root?!"); + Self::new(origin, subpath, url.scheme() != "file") + } + + pub fn parse_raw_uri(&self, raw_uri: &RawUri) -> Result { + let is_root_relative = || { + let text = raw_uri.text.trim_ascii_start(); + text.starts_with('/') && !text.starts_with("//") + }; + + Uri::try_from(raw_uri.clone()) + .or_else(|e| match self { + _ if is_root_relative() && !self.supports_root_relative() => { + // TODO: report more errors if a --root-dir is specified but URL 
falls outside of + // thingy + Err(ErrorKind::InvalidBaseJoin(raw_uri.text.clone())) + } + Self(Some((origin, subpath, _))) => origin + .join_rooted(&[subpath, &raw_uri.text]) + .map_err(|e| ErrorKind::ParseUrl(e, raw_uri.text.clone())) + .map(|url| Uri { url }), + Self(None) => Err(e), + }) + .map(|x| x.url) + } + + // Constructs a `SourceBaseInfo` from the given input source, root and base + // pair, and fallback base. + // + // # Arguments + // + // * `source` - The input source which contains the links we want to resolve. + // * `root_and_base` - An optional pair of root directory and base URL. The + // somewhat complicated type encodes the fact that if a [`Base`] is provided, + // then a [`Path`] must be provided too. If the base URL is omitted but root + // dir is provided, the base URL defaults to the root dir. + // * `fallback_base` - A fallback base URL to use where no other well-founded + // base URL can be derived. If it is applied, the fallback base URL is + // considered to be a well-founded base. + // + // # Root and base + // + // The given root and base URL are used to transform the intrinsic base returned + // by [`InputSource::to_url`]. If the intrinsic base is a subpath of the given + // root, then a new base is constructed by taking the intrinsic base and replacing + // the root dir with the given base URL. + // + // In this way, links from local files can be resolved *as if* they were hosted + // in a remote location at the base URL. Later, in [`SourceBaseInfo::parse_uri`], + // remote links which are subpaths of the base URL will be reflected back to + // local files within the root dir. + // + // # Well-founded bases + // + // Formally, a *well-founded* base is one which is derived from an input + // source which is *not* a local file, or one derived from a local file + // source which is a descendent of the given root dir. 
+ // + // Informally, and importantly for using [`SourceBaseInfo`], a well-founded + // base is one where we can sensibly resolve root-relative links (i.e., + // relative links starting with `/`). + // + // # Errors + // + // This function fails with an [`Err`] if: + // - any of the provided arguments cannot be converted to a URL, or + // - [`SourceBaseInfo::new`] fails. +} + +pub fn prepare_source_base_info( + source: &ResolvedInputSource, + root_and_base: Option<(&Path, Option<&Base>)>, + fallback_base: Option<&Base>, +) -> Result<(SourceBaseInfo, UrlMappings), ErrorKind> { + // TODO: get rid of the Path/Base complication + let root_and_base: Option<(Url, Url)> = match root_and_base { + Some((root, Some(base))) => Some((root, base.clone())), + Some((root, None)) => Some((root, Base::Local(root.to_owned()))), + None => None, + } + .map(|(root, base)| -> Result<_, ErrorKind> { + let root_url = Base::Local(root.to_owned()).to_url()?; + Ok((root_url, base.to_url()?)) + }) + .transpose()?; + + let fallback_base = match fallback_base.map(Base::to_url).transpose()? { + None => SourceBaseInfo::none(), + Some(fallback_url) => SourceBaseInfo::new(fallback_url, String::new(), true)?, + }; + + let mappings = UrlMappings::new(root_and_base.into_iter().collect())?; + + let base_info = match source.to_url()? 
{ + Some(source_url) => match mappings.map_to_old_url(&source_url) { + Some((remote, subpath)) => SourceBaseInfo::new(remote.clone(), subpath, true)?, + None => SourceBaseInfo::infer_source_base(&source_url)?, + }, + None => SourceBaseInfo::none(), + }; + + let base_info = base_info.or_fallback(fallback_base); + + Ok((base_info, mappings)) +} + +pub fn parse_url_with_base_info( + base_info: &SourceBaseInfo, + mappings: &UrlMappings, + raw_uri: &RawUri, +) -> Result { + let url = base_info.parse_raw_uri(raw_uri)?; + + let mut url = match mappings.map_to_new_url(&url) { + Some((local, subpath)) => local.join(&subpath).ok(), + None => None, + } + .unwrap_or(url); + + // BACKWARDS COMPAT: delete trailing slash for file urls + if url.scheme() == "file" { + let _ = url + .path_segments_mut() + .as_mut() + .map(PathSegmentsMut::pop_if_empty); + } + + Ok(Uri { url }) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::num::NonZeroUsize; + use std::path::PathBuf; + + use crate::types::uri::raw::RawUriSpan; + + fn raw_uri(text: &'static str) -> RawUri { + RawUri { + text: text.to_string(), + element: None, + attribute: None, + span: RawUriSpan { + line: NonZeroUsize::MAX, + column: None, + }, + } + } + + // #[test] + // fn test_base_with_filename() { + // let root_dir = PathBuf::from("/some"); + // let base = Base::try_from("https://example.com/path/page2.html").unwrap(); + // let source = ResolvedInputSource::FsPath(PathBuf::from("/some/page.html")); + // let base_info = + // SourceBaseInfo::from_source(&source, Some((&root_dir, Some(&base))), None).unwrap(); + // + // assert_eq!( + // base_info + // .parse_uri(&raw_uri("#fragment")) + // .as_ref() + // .map(|x| x.url.as_str()), + // Ok("file:///some/page.html#fragment") + // ); + // } + // + // #[test] + // fn test_base_with_same_filename() { + // let root_dir = PathBuf::from("/some/pagex.html"); + // let base = Base::try_from("https://example.com/path/page.html").unwrap(); + // let source = 
ResolvedInputSource::FsPath(PathBuf::from("/some/pagex.html")); + // let base_info = + // SourceBaseInfo::from_source(&source, Some((&root_dir, Some(&base))), None).unwrap(); + // + // assert_eq!( + // base_info + // .parse_uri(&raw_uri("#fragment")) + // .as_ref() + // .map(|x| x.url.as_str()), + // Ok("file:///some/pagex.html#fragment") + // ); + // } +} diff --git a/lychee-lib/src/types/mod.rs b/lychee-lib/src/types/mod.rs index f6406b5303..50982378da 100644 --- a/lychee-lib/src/types/mod.rs +++ b/lychee-lib/src/types/mod.rs @@ -2,7 +2,7 @@ mod accept; mod base; -mod base_info; +pub(crate) mod base_mapping; mod basic_auth; mod cache; mod cookies; @@ -22,7 +22,7 @@ pub(crate) mod uri; pub use accept::*; pub use base::Base; -pub use base_info::SourceBaseInfo; +pub use base_mapping::{SourceBaseInfo, UrlMappings}; pub use basic_auth::{BasicAuthCredentials, BasicAuthSelector}; pub use cache::CacheStatus; pub use cookies::CookieJar; diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 9624d3239b..3355acf015 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -6,6 +6,7 @@ use std::collections::HashSet; use std::path::{Path, PathBuf}; use crate::types::SourceBaseInfo; +use crate::types::base_mapping; use crate::{ Base, BasicAuthCredentials, ErrorKind, LycheeResult, Request, RequestError, Uri, basic_auth::BasicAuthExtractor, @@ -29,7 +30,8 @@ fn create_request( base_info: &SourceBaseInfo, extractor: Option<&BasicAuthExtractor>, ) -> LycheeResult { - let uri = base_info.parse_uri(raw_uri)?; + // WARN: BROKEN because this needs to do all mapping. + let uri = Uri { url: base_info.parse_raw_uri(raw_uri)? }; let source = source.clone(); let element = raw_uri.element.clone(); let attribute = raw_uri.attribute.clone(); @@ -47,11 +49,16 @@ fn try_parse_into_uri( base: Option<&Base>, ) -> LycheeResult { // HACK: if only base_url is specified, use that as a fallback_base_url. 
- let base_info = match (root_dir, base) { - (None, base) => SourceBaseInfo::from_source(source, None, base), - (Some(root_dir), base) => SourceBaseInfo::from_source(source, Some((root_dir, base)), None), + let (a, b) = match (root_dir, base) { + (None, base) => base_mapping::prepare_source_base_info(source, None, base), + (Some(root_dir), base) => base_mapping::prepare_source_base_info( + source, + Some((root_dir, base)), + None, + ), }?; - base_info.parse_uri(raw_uri) + + base_mapping::parse_url_with_base_info(&a, &b, raw_uri) } // Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs @@ -105,24 +112,30 @@ pub(crate) fn create( fallback_base: Option<&Base>, extractor: Option<&BasicAuthExtractor>, ) -> Vec> { - let base_info = match SourceBaseInfo::from_source(source, root_and_base, fallback_base) { - Ok(base_info) => base_info, - Err(e) => { - // TODO: return an error inside this vec. - warn!("Error handling source {source}: {e:?}"); - return vec![]; - } - }; + let (base_info, mappings) = + match base_mapping::prepare_source_base_info(source, root_and_base, fallback_base) { + Ok(base_info) => base_info, + Err(e) => { + // TODO: return an error inside this vec. 
+ warn!("Error handling source {source}: {e:?}"); + return vec![]; + } + }; let mut requests = HashSet::::new(); let mut errors = Vec::::new(); for raw_uri in uris { - let result = create_request(&raw_uri, source, &base_info, extractor); - match result { - Ok(request) => { - requests.insert(request); + match base_mapping::parse_url_with_base_info(&base_info, &mappings, &raw_uri) { + Ok(uri) => { + let source = source.clone(); + let element = raw_uri.element.clone(); + let attribute = raw_uri.attribute.clone(); + let credentials = extract_credentials(extractor, &uri); + + requests.insert(Request::new(uri, source, element, attribute, credentials)); } + Err(e) => errors.push(RequestError::CreateRequestItem( raw_uri.clone(), source.clone(), From c6121a8aad48c2d174a33032b2cf2e6ab2a64521 Mon Sep 17 00:00:00 2001 From: rina Date: Sat, 24 Jan 2026 16:16:32 +1000 Subject: [PATCH 51/59] touch --- lychee-lib/src/types/base_mapping.rs | 20 ++++++++------------ lychee-lib/src/types/uri/raw.rs | 9 +++++++++ lychee-lib/src/utils/request.rs | 12 ++++++------ 3 files changed, 23 insertions(+), 18 deletions(-) diff --git a/lychee-lib/src/types/base_mapping.rs b/lychee-lib/src/types/base_mapping.rs index d76a64ed04..9c7cf377e8 100644 --- a/lychee-lib/src/types/base_mapping.rs +++ b/lychee-lib/src/types/base_mapping.rs @@ -100,25 +100,21 @@ impl SourceBaseInfo { } pub fn parse_raw_uri(&self, raw_uri: &RawUri) -> Result { - let is_root_relative = || { - let text = raw_uri.text.trim_ascii_start(); - text.starts_with('/') && !text.starts_with("//") - }; - - Uri::try_from(raw_uri.clone()) - .or_else(|e| match self { - _ if is_root_relative() && !self.supports_root_relative() => { + match Uri::try_from(raw_uri.clone()) { + Ok(Uri { url }) => Ok(url), + Err(e @ ErrorKind::ParseUrl(_, _)) => match self { + _ if raw_uri.is_root_relative() && !self.supports_root_relative() => { // TODO: report more errors if a --root-dir is specified but URL falls outside of // thingy 
Err(ErrorKind::InvalidBaseJoin(raw_uri.text.clone())) } Self(Some((origin, subpath, _))) => origin .join_rooted(&[subpath, &raw_uri.text]) - .map_err(|e| ErrorKind::ParseUrl(e, raw_uri.text.clone())) - .map(|url| Uri { url }), + .map_err(|e| ErrorKind::ParseUrl(e, raw_uri.text.clone())), Self(None) => Err(e), - }) - .map(|x| x.url) + }, + Err(e) => Err(e), + } } // Constructs a `SourceBaseInfo` from the given input source, root and base diff --git a/lychee-lib/src/types/uri/raw.rs b/lychee-lib/src/types/uri/raw.rs index 026ee75821..e5716b0dc4 100644 --- a/lychee-lib/src/types/uri/raw.rs +++ b/lychee-lib/src/types/uri/raw.rs @@ -21,6 +21,15 @@ pub struct RawUri { pub span: RawUriSpan, } +impl RawUri { + /// Returns whether the `RawUri` represents a relative link that is + /// relative to the domain root. Textually, it looks like `/this`. + pub fn is_root_relative(&self) -> bool { + let text = self.text.trim_ascii_start(); + text.starts_with('/') && !text.starts_with("//") + } +} + impl Display for RawUri { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{:?} (Attribute: {:?})", self.text, self.attribute) diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 3355acf015..ec717a9bff 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -31,7 +31,9 @@ fn create_request( extractor: Option<&BasicAuthExtractor>, ) -> LycheeResult { // WARN: BROKEN because this needs to do all mapping. - let uri = Uri { url: base_info.parse_raw_uri(raw_uri)? }; + let uri = Uri { + url: base_info.parse_raw_uri(raw_uri)?, + }; let source = source.clone(); let element = raw_uri.element.clone(); let attribute = raw_uri.attribute.clone(); @@ -51,11 +53,9 @@ fn try_parse_into_uri( // HACK: if only base_url is specified, use that as a fallback_base_url. 
let (a, b) = match (root_dir, base) { (None, base) => base_mapping::prepare_source_base_info(source, None, base), - (Some(root_dir), base) => base_mapping::prepare_source_base_info( - source, - Some((root_dir, base)), - None, - ), + (Some(root_dir), base) => { + base_mapping::prepare_source_base_info(source, Some((root_dir, base)), None) + } }?; base_mapping::parse_url_with_base_info(&a, &b, raw_uri) From 1a14c40cefb929270a42e2b08e1377b5a44225e5 Mon Sep 17 00:00:00 2001 From: rina Date: Sat, 24 Jan 2026 16:21:28 +1000 Subject: [PATCH 52/59] one less clone --- lychee-lib/src/types/base_mapping.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lychee-lib/src/types/base_mapping.rs b/lychee-lib/src/types/base_mapping.rs index 9c7cf377e8..ffd58c5a65 100644 --- a/lychee-lib/src/types/base_mapping.rs +++ b/lychee-lib/src/types/base_mapping.rs @@ -100,7 +100,7 @@ impl SourceBaseInfo { } pub fn parse_raw_uri(&self, raw_uri: &RawUri) -> Result { - match Uri::try_from(raw_uri.clone()) { + match Uri::try_from(raw_uri.text.as_ref()) { Ok(Uri { url }) => Ok(url), Err(e @ ErrorKind::ParseUrl(_, _)) => match self { _ if raw_uri.is_root_relative() && !self.supports_root_relative() => { From d146124e74fd93a91a7748ed7e28e292b6a3e796 Mon Sep 17 00:00:00 2001 From: rina Date: Sat, 24 Jan 2026 16:36:51 +1000 Subject: [PATCH 53/59] clean up root_and_base --- lychee-lib/src/types/base_mapping.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/lychee-lib/src/types/base_mapping.rs b/lychee-lib/src/types/base_mapping.rs index ffd58c5a65..4504750dd1 100644 --- a/lychee-lib/src/types/base_mapping.rs +++ b/lychee-lib/src/types/base_mapping.rs @@ -167,15 +167,13 @@ pub fn prepare_source_base_info( ) -> Result<(SourceBaseInfo, UrlMappings), ErrorKind> { // TODO: get rid of the Path/Base complication let root_and_base: Option<(Url, Url)> = match root_and_base { - Some((root, Some(base))) => Some((root, base.clone())), - Some((root, None)) => 
Some((root, Base::Local(root.to_owned()))), + Some((root, base_option)) => { + let root = Base::Local(root.to_owned()).to_url()?; + let base = base_option.map_or_else(|| Ok(root.clone()), Base::to_url)?; + Some((root, base)) + } None => None, - } - .map(|(root, base)| -> Result<_, ErrorKind> { - let root_url = Base::Local(root.to_owned()).to_url()?; - Ok((root_url, base.to_url()?)) - }) - .transpose()?; + }; let fallback_base = match fallback_base.map(Base::to_url).transpose()? { None => SourceBaseInfo::none(), From 5c53ec07bbe81cfe0edb008ac669d4fd71909b5c Mon Sep 17 00:00:00 2001 From: rina Date: Sat, 24 Jan 2026 16:44:49 +1000 Subject: [PATCH 54/59] comments --- lychee-lib/src/types/base_mapping.rs | 2 +- lychee-lib/src/utils/request.rs | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/lychee-lib/src/types/base_mapping.rs b/lychee-lib/src/types/base_mapping.rs index 4504750dd1..21274692d6 100644 --- a/lychee-lib/src/types/base_mapping.rs +++ b/lychee-lib/src/types/base_mapping.rs @@ -165,8 +165,8 @@ pub fn prepare_source_base_info( root_and_base: Option<(&Path, Option<&Base>)>, fallback_base: Option<&Base>, ) -> Result<(SourceBaseInfo, UrlMappings), ErrorKind> { - // TODO: get rid of the Path/Base complication let root_and_base: Option<(Url, Url)> = match root_and_base { + // if root is specified but not base, use root dir as the base as well. Some((root, base_option)) => { let root = Base::Local(root.to_owned()).to_url()?; let base = base_option.map_or_else(|| Ok(root.clone()), Base::to_url)?; diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index ec717a9bff..eeec528479 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -112,11 +112,13 @@ pub(crate) fn create( fallback_base: Option<&Base>, extractor: Option<&BasicAuthExtractor>, ) -> Vec> { + // TODO: it would probably be nice to inline prepare_source_base_info into this function. 
+ // however, it uses a lot of `.?` and we need to catch and handle all those errors here. let (base_info, mappings) = match base_mapping::prepare_source_base_info(source, root_and_base, fallback_base) { Ok(base_info) => base_info, Err(e) => { - // TODO: return an error inside this vec. + // TODO: IMPORTANT! return an error inside this vec. warn!("Error handling source {source}: {e:?}"); return vec![]; } From 1437de12aab8c5c715c344baed07cdaab10fc5e6 Mon Sep 17 00:00:00 2001 From: rina Date: Sat, 24 Jan 2026 18:54:58 +1000 Subject: [PATCH 55/59] flatten SourceBaseInfo options, and write a lot. --- lychee-lib/src/types/base_mapping.rs | 239 ++++++++++++++++++--------- lychee-lib/src/types/uri/raw.rs | 9 - lychee-lib/src/utils/request.rs | 2 +- 3 files changed, 158 insertions(+), 92 deletions(-) diff --git a/lychee-lib/src/types/base_mapping.rs b/lychee-lib/src/types/base_mapping.rs index 21274692d6..969a436364 100644 --- a/lychee-lib/src/types/base_mapping.rs +++ b/lychee-lib/src/types/base_mapping.rs @@ -1,3 +1,7 @@ +//! Parses and resolves [`RawUri`] into into fully-qualified [`Uri`] by +//! applying base URL and root dir mappings. +//! + use reqwest::Url; use std::path::Path; @@ -9,109 +13,141 @@ use crate::types::uri::raw::RawUri; use crate::utils::url::ReqwestUrlExt; use url::PathSegmentsMut; -/// Information needed for resolving relative URLs within a particular -/// [`InputSource`]. The main entry point for constructing a `SourceBaseInfo` -/// is [`SourceBaseInfo::from_source`]. Once constructed, -/// [`SourceBaseInfo::parse_uri`] can be used to parse a URI found within -/// the `InputSource`. +/// Information used for resolving relative URLs within a particular +/// input source. There should be a 1:1 correspondence between each +/// `SourceBaseInfo` and its originating `InputSource`. The main entry +/// point for constructing is [`SourceBaseInfo::from_source_url`]. 
+/// +/// Once constructed, [`SourceBaseInfo::parse_url_text`] can be used to +/// parse and resolve a (possibly relative) URL obtained from within +/// the associated `InputSource`. /// -/// A `SourceBaseInfo` may or may not have an associated base which is used -/// for resolving relative URLs. If no base is available, parsing relative -/// and root-relative links will fail. If a base is available but it is not -/// *well-founded*, then parsing root-relative links will fail. See -/// [`SourceBaseInfo::from_source`] for a description of well-founded. +/// A `SourceBaseInfo` may be built from input sources which cannot resolve +/// relative links---for instance, stdin. It may also be built from input +/// sources which can resolve *locally*-relative links, but not *root*-relative +/// links. #[derive(Debug, PartialEq, Eq, Clone)] -pub struct SourceBaseInfo(Option<(Url, String, bool)>); -/// Tuple of `origin`, `subpath`, `allow_absolute`. The field `allow_absolute` -/// is true if the base is well-founded. +pub enum SourceBaseInfo { + /// No base information is available. This is for sources with no base + /// information, such as [`ResolvedInputSource::Stdin`]. This can + /// resolve no relative links, and only fully-qualified links will be + /// parsed successfully. + None, -pub struct UrlMappings { - /// List of tuples of `old_url`, `new_url`. - mappings: Vec<(Url, Url)>, -} + /// A base which cannot resolve root-relative links. This is for + /// `file:` URLs where the root directory is not known. As such, you can + /// traverse relative to the current URL (by traversing the filesystem), + /// but you cannot jump to the "root". + NoRoot(Url), -impl UrlMappings { - pub fn new(mappings: Vec<(Url, Url)>) -> Result { - // TODO: check no repeated bases/roots on the same side. 
- // TODO: choose longest match if multiple could apply - let conflicting_mapping = mappings.iter().find(|(remote, local)| { - if remote == local { - false - } else { - remote.strip_prefix(local).is_some() || local.strip_prefix(remote).is_some() - } - }); + /// A full base made up of `origin` and `path`. This can resolve + /// all kinds of relative links. + /// + /// All fully-qualified non-`file:` URLs fall into this case. For these, + /// `origin` and `path` are obtained by dividing the source URL into its + /// origin and path. When joined, `${origin}/${path}` should be equivalent + /// to the source's original URL. + /// + /// For `file:` URLs, the `origin` serves as the root which will be used + /// to resolve root-relative links (i.e., it's the root dir). The `path` + /// field is the subpath to a particular file within the root dir. This + /// is retained to resolve locally-relative links. + Full(Url, String), +} - match conflicting_mapping { - Some((base, root)) => Err(ErrorKind::InvalidBase( - base.to_string(), - format!("base cannot be parent or child of root-dir {root}"), - )), - None => Ok(Self { mappings }), - } +impl SourceBaseInfo { + /// Constructs [`SourceBaseInfo::None`]. + pub fn no_info() -> Self { + Self::None } - pub fn map_to_old_url(&self, url: &Url) -> Option<(&Url, String)> { - self.mappings - .iter() - .find_map(|(left, right)| url.strip_prefix(left).map(|subpath| (right, subpath))) + /// Constructs [`SourceBaseInfo::Full`] with the given fields. + pub fn full_info(origin: Url, path: String) -> Self { + Self::Full(origin, path) } - pub fn map_to_new_url(&self, url: &Url) -> Option<(&Url, String)> { - self.mappings - .iter() - .find_map(|(left, right)| url.strip_prefix(right).map(|subpath| (left, subpath))) - } -} + /// Constructs a [`SourceBaseInfo`], with the variant being determined by the given URL. + /// + /// - A [`Url::cannot_be_a_base`] URL will yield [`SourceBaseInfo::None`]. 
+ /// - A `file:` URL will yield [`SourceBaseInfo::NoRoot`]. + /// - For other URLs, a [`SourceBaseInfo::Full`] will be constructed from the URL's + /// origin and path. + /// + pub fn from_source_url(url: &Url) -> Self { + if url.scheme() == "file" { + Self::NoRoot(url.clone()) + } else { + let mut origin = url.clone(); -impl SourceBaseInfo { - pub fn new( - origin: Url, - subpath: String, - supports_root_relative: bool, - ) -> Result { - Ok(Self(Some((origin, subpath, supports_root_relative)))) - } + match origin.path_segments_mut() { + Ok(mut segments) => segments.clear(), + Err(()) => return Self::no_info(), + }; + + let path = match url.path().strip_prefix('/') { + Some(path) => path.to_string(), + None => return Self::no_info(), + }; - pub fn none() -> Self { - Self(None) + Self::Full(origin, path) + } } pub fn supports_root_relative(&self) -> bool { - self.0.as_ref().is_some_and(|x| x.2) + matches!(self, Self::Full(_, _)) } + pub fn supports_locally_relative(&self) -> bool { + !matches!(self, Self::None) + } + + /// Returns the [`SourceBaseInfo`] which has _more information_ + /// between `self` and the given `fallback`. + /// + /// [`SourceBaseInfo::Full`] is preferred over [`SourceBaseInfo::NoRoot`] + /// which is preferred over [`SourceBaseInfo::None`]. If both `self` + /// and `fallback` are the same variant, then `self` will be preferred. 
pub fn or_fallback(self, fallback: Self) -> Self { - if self.supports_root_relative() { - self - } else { - fallback + match (self, fallback) { + (x @ Self::Full(_, _), _) => x, + (_, x @ Self::Full(_, _)) => x, + (x @ Self::NoRoot(_), _) => x, + (_, x @ Self::NoRoot(_)) => x, + (Self::None, Self::None) => Self::None, } } - pub fn infer_source_base(url: &Url) -> Result { - let origin = url - .join("/") - .map_err(|e| ErrorKind::ParseUrl(e, url.to_string()))?; - let subpath = origin - .make_relative(url) - .expect("failed make a url relative to its own origin root?!"); - Self::new(origin, subpath, url.scheme() != "file") + /// Returns whether the text represents a relative link that is + /// relative to the domain root. Textually, it looks like `/this`. + fn is_root_relative(text: &str) -> bool { + let text = text.trim_ascii_start(); + text.starts_with('/') && !text.starts_with("//") } - pub fn parse_raw_uri(&self, raw_uri: &RawUri) -> Result { - match Uri::try_from(raw_uri.text.as_ref()) { + /// Parses the given URL text into a fully-qualified URL, including + /// resolving relative links if supported by the current [`SourceBaseInfo`]. + /// + /// # Errors + /// + /// Returns an error if the text is an invalid URL, or the text is a + /// relative link and this [`SourceBaseInfo`] variant cannot resolve + /// the relative link. 
+ pub fn parse_url_text(&self, text: &str) -> Result { + match Uri::try_from(text.as_ref()) { Ok(Uri { url }) => Ok(url), Err(e @ ErrorKind::ParseUrl(_, _)) => match self { - _ if raw_uri.is_root_relative() && !self.supports_root_relative() => { + Self::NoRoot(_) if Self::is_root_relative(text) => { // TODO: report more errors if a --root-dir is specified but URL falls outside of // thingy - Err(ErrorKind::InvalidBaseJoin(raw_uri.text.clone())) + Err(ErrorKind::InvalidBaseJoin(text.to_string())) } - Self(Some((origin, subpath, _))) => origin - .join_rooted(&[subpath, &raw_uri.text]) - .map_err(|e| ErrorKind::ParseUrl(e, raw_uri.text.clone())), - Self(None) => Err(e), + Self::NoRoot(base) => base + .join_rooted(&[&text]) + .map_err(|e| ErrorKind::ParseUrl(e, text.to_string())), + Self::Full(origin, subpath) => origin + .join_rooted(&[subpath, &text]) + .map_err(|e| ErrorKind::ParseUrl(e, text.to_string())), + Self::None => Err(e), }, Err(e) => Err(e), } @@ -160,6 +196,45 @@ impl SourceBaseInfo { // - [`SourceBaseInfo::new`] fails. } +pub struct UrlMappings { + /// List of tuples of `old_url`, `new_url`. + mappings: Vec<(Url, Url)>, +} + +impl UrlMappings { + pub fn new(mappings: Vec<(Url, Url)>) -> Result { + // TODO: check no repeated bases/roots on the same side. 
+ // TODO: choose longest match if multiple could apply + let conflicting_mapping = mappings.iter().find(|(remote, local)| { + if remote == local { + false + } else { + remote.strip_prefix(local).is_some() || local.strip_prefix(remote).is_some() + } + }); + + match conflicting_mapping { + Some((base, root)) => Err(ErrorKind::InvalidBase( + base.to_string(), + format!("base cannot be parent or child of root-dir {root}"), + )), + None => Ok(Self { mappings }), + } + } + + pub fn map_to_old_url(&self, url: &Url) -> Option<(&Url, String)> { + self.mappings + .iter() + .find_map(|(left, right)| url.strip_prefix(left).map(|subpath| (right, subpath))) + } + + pub fn map_to_new_url(&self, url: &Url) -> Option<(&Url, String)> { + self.mappings + .iter() + .find_map(|(left, right)| url.strip_prefix(right).map(|subpath| (left, subpath))) + } +} + pub fn prepare_source_base_info( source: &ResolvedInputSource, root_and_base: Option<(&Path, Option<&Base>)>, @@ -176,18 +251,18 @@ pub fn prepare_source_base_info( }; let fallback_base = match fallback_base.map(Base::to_url).transpose()? { - None => SourceBaseInfo::none(), - Some(fallback_url) => SourceBaseInfo::new(fallback_url, String::new(), true)?, + None => SourceBaseInfo::no_info(), + Some(fallback_url) => SourceBaseInfo::full_info(fallback_url, String::new()), }; let mappings = UrlMappings::new(root_and_base.into_iter().collect())?; let base_info = match source.to_url()? 
{ Some(source_url) => match mappings.map_to_old_url(&source_url) { - Some((remote, subpath)) => SourceBaseInfo::new(remote.clone(), subpath, true)?, - None => SourceBaseInfo::infer_source_base(&source_url)?, + Some((remote, subpath)) => SourceBaseInfo::full_info(remote.clone(), subpath), + None => SourceBaseInfo::from_source_url(&source_url), }, - None => SourceBaseInfo::none(), + None => SourceBaseInfo::no_info(), }; let base_info = base_info.or_fallback(fallback_base); @@ -200,7 +275,7 @@ pub fn parse_url_with_base_info( mappings: &UrlMappings, raw_uri: &RawUri, ) -> Result { - let url = base_info.parse_raw_uri(raw_uri)?; + let url = base_info.parse_url_text(&raw_uri.text)?; let mut url = match mappings.map_to_new_url(&url) { Some((local, subpath)) => local.join(&subpath).ok(), diff --git a/lychee-lib/src/types/uri/raw.rs b/lychee-lib/src/types/uri/raw.rs index e5716b0dc4..026ee75821 100644 --- a/lychee-lib/src/types/uri/raw.rs +++ b/lychee-lib/src/types/uri/raw.rs @@ -21,15 +21,6 @@ pub struct RawUri { pub span: RawUriSpan, } -impl RawUri { - /// Returns whether the `RawUri` represents a relative link that is - /// relative to the domain root. Textually, it looks like `/this`. - pub fn is_root_relative(&self) -> bool { - let text = self.text.trim_ascii_start(); - text.starts_with('/') && !text.starts_with("//") - } -} - impl Display for RawUri { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{:?} (Attribute: {:?})", self.text, self.attribute) diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index eeec528479..163bb6fbc0 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -32,7 +32,7 @@ fn create_request( ) -> LycheeResult { // WARN: BROKEN because this needs to do all mapping. 
let uri = Uri { - url: base_info.parse_raw_uri(raw_uri)?, + url: base_info.parse_url_text(&raw_uri.text)?, }; let source = source.clone(); let element = raw_uri.element.clone(); From ae87f974531bbb5542513f72513a06e99857d449 Mon Sep 17 00:00:00 2001 From: rina Date: Sat, 24 Jan 2026 22:03:18 +1000 Subject: [PATCH 56/59] url_mapping separate --- .../types/{base_mapping.rs => base_info.rs} | 43 +------------------ lychee-lib/src/types/mod.rs | 6 ++- lychee-lib/src/types/url_mapping.rs | 43 +++++++++++++++++++ lychee-lib/src/utils/request.rs | 12 +++--- 4 files changed, 55 insertions(+), 49 deletions(-) rename lychee-lib/src/types/{base_mapping.rs => base_info.rs} (89%) create mode 100644 lychee-lib/src/types/url_mapping.rs diff --git a/lychee-lib/src/types/base_mapping.rs b/lychee-lib/src/types/base_info.rs similarity index 89% rename from lychee-lib/src/types/base_mapping.rs rename to lychee-lib/src/types/base_info.rs index 969a436364..f24a0aac6d 100644 --- a/lychee-lib/src/types/base_mapping.rs +++ b/lychee-lib/src/types/base_info.rs @@ -9,6 +9,7 @@ use crate::Base; use crate::ErrorKind; use crate::ResolvedInputSource; use crate::Uri; +use crate::types::UrlMappings; use crate::types::uri::raw::RawUri; use crate::utils::url::ReqwestUrlExt; use url::PathSegmentsMut; @@ -72,7 +73,6 @@ impl SourceBaseInfo { /// - A `file:` URL will yield [`SourceBaseInfo::NoRoot`]. /// - For other URLs, a [`SourceBaseInfo::Full`] will be constructed from the URL's /// origin and path. - /// pub fn from_source_url(url: &Url) -> Self { if url.scheme() == "file" { Self::NoRoot(url.clone()) @@ -129,7 +129,7 @@ impl SourceBaseInfo { /// /// # Errors /// - /// Returns an error if the text is an invalid URL, or the text is a + /// Returns an error if the text is an invalid URL, or if the text is a /// relative link and this [`SourceBaseInfo`] variant cannot resolve /// the relative link. 
pub fn parse_url_text(&self, text: &str) -> Result { @@ -196,45 +196,6 @@ impl SourceBaseInfo { // - [`SourceBaseInfo::new`] fails. } -pub struct UrlMappings { - /// List of tuples of `old_url`, `new_url`. - mappings: Vec<(Url, Url)>, -} - -impl UrlMappings { - pub fn new(mappings: Vec<(Url, Url)>) -> Result { - // TODO: check no repeated bases/roots on the same side. - // TODO: choose longest match if multiple could apply - let conflicting_mapping = mappings.iter().find(|(remote, local)| { - if remote == local { - false - } else { - remote.strip_prefix(local).is_some() || local.strip_prefix(remote).is_some() - } - }); - - match conflicting_mapping { - Some((base, root)) => Err(ErrorKind::InvalidBase( - base.to_string(), - format!("base cannot be parent or child of root-dir {root}"), - )), - None => Ok(Self { mappings }), - } - } - - pub fn map_to_old_url(&self, url: &Url) -> Option<(&Url, String)> { - self.mappings - .iter() - .find_map(|(left, right)| url.strip_prefix(left).map(|subpath| (right, subpath))) - } - - pub fn map_to_new_url(&self, url: &Url) -> Option<(&Url, String)> { - self.mappings - .iter() - .find_map(|(left, right)| url.strip_prefix(right).map(|subpath| (left, subpath))) - } -} - pub fn prepare_source_base_info( source: &ResolvedInputSource, root_and_base: Option<(&Path, Option<&Base>)>, diff --git a/lychee-lib/src/types/mod.rs b/lychee-lib/src/types/mod.rs index 50982378da..261bcbc9e3 100644 --- a/lychee-lib/src/types/mod.rs +++ b/lychee-lib/src/types/mod.rs @@ -2,7 +2,7 @@ mod accept; mod base; -pub(crate) mod base_mapping; +pub(crate) mod base_info; mod basic_auth; mod cache; mod cookies; @@ -19,10 +19,11 @@ mod response; mod status; mod status_code_selector; pub(crate) mod uri; +pub(crate) mod url_mapping; pub use accept::*; pub use base::Base; -pub use base_mapping::{SourceBaseInfo, UrlMappings}; +pub use base_info::SourceBaseInfo; pub use basic_auth::{BasicAuthCredentials, BasicAuthSelector}; pub use cache::CacheStatus; pub use 
cookies::CookieJar; @@ -36,6 +37,7 @@ pub use request_error::RequestError; pub use response::{Response, ResponseBody}; pub use status::Status; pub use status_code_selector::*; +pub use url_mapping::UrlMappings; /// The lychee `Result` type pub type Result = std::result::Result; diff --git a/lychee-lib/src/types/url_mapping.rs b/lychee-lib/src/types/url_mapping.rs new file mode 100644 index 0000000000..e5fa6f2b47 --- /dev/null +++ b/lychee-lib/src/types/url_mapping.rs @@ -0,0 +1,43 @@ +use reqwest::Url; +use crate::ErrorKind; +use crate::utils::url::ReqwestUrlExt; + +pub struct UrlMappings { + /// List of tuples of `old_url`, `new_url`. + mappings: Vec<(Url, Url)>, +} + +impl UrlMappings { + pub fn new(mappings: Vec<(Url, Url)>) -> Result { + // TODO: check no repeated bases/roots on the same side. + // TODO: choose longest match if multiple could apply + let conflicting_mapping = mappings.iter().find(|(remote, local)| { + if remote == local { + false + } else { + remote.strip_prefix(local).is_some() || local.strip_prefix(remote).is_some() + } + }); + + match conflicting_mapping { + Some((base, root)) => Err(ErrorKind::InvalidBase( + base.to_string(), + format!("base cannot be parent or child of root-dir {root}"), + )), + None => Ok(Self { mappings }), + } + } + + pub fn map_to_old_url(&self, url: &Url) -> Option<(&Url, String)> { + self.mappings + .iter() + .find_map(|(left, right)| url.strip_prefix(left).map(|subpath| (right, subpath))) + } + + pub fn map_to_new_url(&self, url: &Url) -> Option<(&Url, String)> { + self.mappings + .iter() + .find_map(|(left, right)| url.strip_prefix(right).map(|subpath| (left, subpath))) + } +} + diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 163bb6fbc0..d8bb4b77b5 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -6,7 +6,7 @@ use std::collections::HashSet; use std::path::{Path, PathBuf}; use crate::types::SourceBaseInfo; -use crate::types::base_mapping; +use 
crate::types::base_info; use crate::{ Base, BasicAuthCredentials, ErrorKind, LycheeResult, Request, RequestError, Uri, basic_auth::BasicAuthExtractor, @@ -52,13 +52,13 @@ fn try_parse_into_uri( ) -> LycheeResult { // HACK: if only base_url is specified, use that as a fallback_base_url. let (a, b) = match (root_dir, base) { - (None, base) => base_mapping::prepare_source_base_info(source, None, base), + (None, base) => base_info::prepare_source_base_info(source, None, base), (Some(root_dir), base) => { - base_mapping::prepare_source_base_info(source, Some((root_dir, base)), None) + base_info::prepare_source_base_info(source, Some((root_dir, base)), None) } }?; - base_mapping::parse_url_with_base_info(&a, &b, raw_uri) + base_info::parse_url_with_base_info(&a, &b, raw_uri) } // Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs @@ -115,7 +115,7 @@ pub(crate) fn create( // TODO: it would probably be nice to inline prepare_source_base_info into this function. // however, it uses a lot of `.?` and we need to catch and handle all those errors here. let (base_info, mappings) = - match base_mapping::prepare_source_base_info(source, root_and_base, fallback_base) { + match base_info::prepare_source_base_info(source, root_and_base, fallback_base) { Ok(base_info) => base_info, Err(e) => { // TODO: IMPORTANT! return an error inside this vec. 
@@ -128,7 +128,7 @@ pub(crate) fn create( let mut errors = Vec::::new(); for raw_uri in uris { - match base_mapping::parse_url_with_base_info(&base_info, &mappings, &raw_uri) { + match base_info::parse_url_with_base_info(&base_info, &mappings, &raw_uri) { Ok(uri) => { let source = source.clone(); let element = raw_uri.element.clone(); From 70b8352eaff6e86b9e5121168d8381ca900e9ef8 Mon Sep 17 00:00:00 2001 From: rina Date: Sat, 24 Jan 2026 22:38:54 +1000 Subject: [PATCH 57/59] docs --- lychee-lib/src/types/base_info.rs | 28 ++++++++++++--- lychee-lib/src/types/url_mapping.rs | 55 ++++++++++++++++++++++------- lychee-lib/src/utils/request.rs | 4 +-- lychee-lib/src/utils/url.rs | 43 ++++++++++++---------- 4 files changed, 93 insertions(+), 37 deletions(-) diff --git a/lychee-lib/src/types/base_info.rs b/lychee-lib/src/types/base_info.rs index f24a0aac6d..61db29f9b0 100644 --- a/lychee-lib/src/types/base_info.rs +++ b/lychee-lib/src/types/base_info.rs @@ -1,6 +1,5 @@ //! Parses and resolves [`RawUri`] into into fully-qualified [`Uri`] by //! applying base URL and root dir mappings. -//! use reqwest::Url; use std::path::Path; @@ -74,6 +73,7 @@ impl SourceBaseInfo { /// - For other URLs, a [`SourceBaseInfo::Full`] will be constructed from the URL's /// origin and path. pub fn from_source_url(url: &Url) -> Self { + // TODO: should we return error if a cannot_be_a_base is given? if url.scheme() == "file" { Self::NoRoot(url.clone()) } else { @@ -196,6 +196,17 @@ impl SourceBaseInfo { // - [`SourceBaseInfo::new`] fails. } +/// Prepares the needed structures to resolve links within a particular input source, +/// while handling roots and bases. +/// +/// This should be called once for each [`ResolvedInputSource`] being processed. +/// The result of this function should be used with [`parse_url_with_base_info`] +/// to parse and resolve URLs. +/// +/// # Errors +/// +/// Returns an error if converting any of the given arguments to a URL fails +/// unexpectedly. 
pub fn prepare_source_base_info( source: &ResolvedInputSource, root_and_base: Option<(&Path, Option<&Base>)>, @@ -213,7 +224,7 @@ pub fn prepare_source_base_info( let fallback_base = match fallback_base.map(Base::to_url).transpose()? { None => SourceBaseInfo::no_info(), - Some(fallback_url) => SourceBaseInfo::full_info(fallback_url, String::new()), + Some(fallback_url) => SourceBaseInfo::from_source_url(&fallback_url), }; let mappings = UrlMappings::new(root_and_base.into_iter().collect())?; @@ -226,17 +237,26 @@ pub fn prepare_source_base_info( None => SourceBaseInfo::no_info(), }; + // NOTE: using fallback base in this way lets it override non-rooted + // file:// bases. let base_info = base_info.or_fallback(fallback_base); Ok((base_info, mappings)) } +/// Parses and resolves the given URL text using the given base and mapping +/// information. +/// +/// # Errors +/// +/// Returns an error if the given text cannot be parsed as a URL, or if the +/// text parses as a relative URL and it cannot be resolved. pub fn parse_url_with_base_info( base_info: &SourceBaseInfo, mappings: &UrlMappings, - raw_uri: &RawUri, + text: &str, ) -> Result { - let url = base_info.parse_url_text(&raw_uri.text)?; + let url = base_info.parse_url_text(text)?; let mut url = match mappings.map_to_new_url(&url) { Some((local, subpath)) => local.join(&subpath).ok(), diff --git a/lychee-lib/src/types/url_mapping.rs b/lychee-lib/src/types/url_mapping.rs index e5fa6f2b47..453cfa54fb 100644 --- a/lychee-lib/src/types/url_mapping.rs +++ b/lychee-lib/src/types/url_mapping.rs @@ -1,21 +1,41 @@ -use reqwest::Url; +//! Mapping of URLs based on prefix matches of the URL's path structure. use crate::ErrorKind; use crate::utils::url::ReqwestUrlExt; +use reqwest::Url; +/// A collection of URL mappings which can be applied in either direction. +/// +/// Mappings are from URL to URL. 
A URL matches with a particular mapping +/// (and hence, the mapping will be applied) when the URL is a subpath +/// of the mapping source URL. Equivalently, this is when the URL has +/// a mapping's source URL as a prefix. +/// +/// Mappings are provided as pairs and the mapping can be interpreted in +/// either direction; the left URL can be mapped to the right, or +/// vice-versa. +/// +/// Despite this, we call the left side the "old URL" and the right side the +/// "new URL", since most uses will have _some_ level of directionality. +#[derive(Debug, PartialEq, Eq, Clone)] pub struct UrlMappings { /// List of tuples of `old_url`, `new_url`. mappings: Vec<(Url, Url)>, } impl UrlMappings { + /// Constructs a new [`UrlMappings`] from the given mappings. + /// + /// # Errors + /// + /// If any pair has a URL which is a subpath of its other URL. pub fn new(mappings: Vec<(Url, Url)>) -> Result { // TODO: check no repeated bases/roots on the same side. - // TODO: choose longest match if multiple could apply let conflicting_mapping = mappings.iter().find(|(remote, local)| { if remote == local { false } else { - remote.strip_prefix(local).is_some() || local.strip_prefix(remote).is_some() + remote.strictly_relative_to(local).is_some() + || local.strictly_relative_to(remote).is_some() } }); @@ -28,16 +48,27 @@ impl UrlMappings { } } - pub fn map_to_old_url(&self, url: &Url) -> Option<(&Url, String)> { - self.mappings - .iter() - .find_map(|(left, right)| url.strip_prefix(left).map(|subpath| (right, subpath))) + /// Matches the given URL against the old (left) URLs and + /// returns the new (right) URL of the first matched pair, if any. + /// + /// If matched, the returned option will contain a URL from the new + /// side of a mapping, along with the subpath of the given URL when + /// the corresponding old URL is removed from it. + pub fn map_to_new_url(&self, url: &Url) -> Option<(&Url, String)> { + // TODO: choose longest match if multiple could apply?? 
+ self.mappings.iter().find_map(|(left, right)| { + url.strictly_relative_to(right) + .map(|subpath| (left, subpath)) + }) } - pub fn map_to_new_url(&self, url: &Url) -> Option<(&Url, String)> { - self.mappings - .iter() - .find_map(|(left, right)| url.strip_prefix(right).map(|subpath| (left, subpath))) + /// Like [`UrlMappings::map_to_new_url`] but in the reverse direction, + /// matching against the new URLs and returning the corresponding + /// old URL of the matched mapping, if any. + pub fn map_to_old_url(&self, url: &Url) -> Option<(&Url, String)> { + self.mappings.iter().find_map(|(left, right)| { + url.strictly_relative_to(left) + .map(|subpath| (right, subpath)) + }) } } - diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index d8bb4b77b5..2322e90f94 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -58,7 +58,7 @@ fn try_parse_into_uri( } }?; - base_info::parse_url_with_base_info(&a, &b, raw_uri) + base_info::parse_url_with_base_info(&a, &b, &raw_uri.text) } // Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs @@ -128,7 +128,7 @@ pub(crate) fn create( let mut errors = Vec::::new(); for raw_uri in uris { - match base_info::parse_url_with_base_info(&base_info, &mappings, &raw_uri) { + match base_info::parse_url_with_base_info(&base_info, &mappings, &raw_uri.text) { Ok(uri) => { let source = source.clone(); let element = raw_uri.element.clone(); diff --git a/lychee-lib/src/utils/url.rs b/lychee-lib/src/utils/url.rs index b29d13a5e3..35a7cbf8e3 100644 --- a/lychee-lib/src/utils/url.rs +++ b/lychee-lib/src/utils/url.rs @@ -27,12 +27,12 @@ pub(crate) fn find_links(input: &str) -> impl Iterator> } pub(crate) trait ReqwestUrlExt { - fn strip_prefix(&self, prefix: &Url) -> Option; + fn strictly_relative_to(&self, prefix: &Url) -> Option; fn join_rooted(&self, subpaths: &[&str]) -> Result; } impl ReqwestUrlExt for Url { - fn strip_prefix(&self, prefix: &Url) -> 
Option { + fn strictly_relative_to(&self, prefix: &Url) -> Option { if self.scheme() != prefix.scheme() || self.authority() != prefix.authority() || self.port() != prefix.port() @@ -156,21 +156,26 @@ mod test_url_ext { } #[test] - fn test_strip_prefix() { + fn test_strictly_relative_to() { // note trailing slashes for subpaths, otherwise everything becomes siblings let goog = Url::parse("https://goog.com").unwrap(); let goog_subpath = goog.join("subpath/").unwrap(); let goog_subsubpath = goog_subpath.join("sub2path/").unwrap(); - assert_eq!(goog.strip_prefix(&goog).as_deref(), Some("")); + assert_eq!(goog.strictly_relative_to(&goog).as_deref(), Some("")); assert_eq!( - goog_subpath.strip_prefix(&goog).as_deref(), + goog_subpath.strictly_relative_to(&goog).as_deref(), Some("subpath/") ); - assert_eq!(goog.strip_prefix(&goog_subpath).as_deref(), None); + assert_eq!(goog.strictly_relative_to(&goog_subpath).as_deref(), None); - assert_eq!(goog_subpath.strip_prefix(&goog_subsubpath).as_deref(), None); + assert_eq!( + goog_subpath + .strictly_relative_to(&goog_subsubpath) + .as_deref(), + None + ); } #[test] @@ -178,19 +183,19 @@ mod test_url_ext { // exact match assert_eq!( url!("https://a.com/b/x") - .strip_prefix(&url!("https://a.com/b/x")) + .strictly_relative_to(&url!("https://a.com/b/x")) .as_deref(), Some("") ); assert_eq!( url!("https://a.com/b/") - .strip_prefix(&url!("https://a.com/b/")) + .strictly_relative_to(&url!("https://a.com/b/")) .as_deref(), Some("") ); assert_eq!( url!("https://a.com/b/x?a=2") - .strip_prefix(&url!("https://a.com/b/x?b=x")) + .strictly_relative_to(&url!("https://a.com/b/x?b=x")) .as_deref(), Some("?a=2") ); @@ -198,13 +203,13 @@ mod test_url_ext { // no matches due to / difference assert_eq!( url!("https://a.com/b") - .strip_prefix(&url!("https://a.com/b/")) + .strictly_relative_to(&url!("https://a.com/b/")) .as_deref(), None ); assert_eq!( url!("https://a.com/b/") - .strip_prefix(&url!("https://a.com/b")) + 
.strictly_relative_to(&url!("https://a.com/b")) .as_deref(), None ); @@ -212,7 +217,7 @@ mod test_url_ext { // changing filename leads to no match assert_eq!( url!("https://a.com/b/x") - .strip_prefix(&url!("https://a.com/b/aa")) + .strictly_relative_to(&url!("https://a.com/b/aa")) .as_deref(), None ); @@ -220,7 +225,7 @@ mod test_url_ext { // matching in subdir assert_eq!( url!("https://a.com/b/x") - .strip_prefix(&url!("https://a.com/b/")) + .strictly_relative_to(&url!("https://a.com/b/")) .as_deref(), Some("x") ); @@ -228,19 +233,19 @@ mod test_url_ext { // no match assert_eq!( url!("https://a.com/b/x") - .strip_prefix(&url!("https://a.com/b")) + .strictly_relative_to(&url!("https://a.com/b")) .as_deref(), None ); assert_eq!( url!("https://a.com/b/x") - .strip_prefix(&url!("https://a.com/a")) + .strictly_relative_to(&url!("https://a.com/a")) .as_deref(), None ); assert_eq!( url!("https://a.com/b/x") - .strip_prefix(&url!("https://a.com/a/")) + .strictly_relative_to(&url!("https://a.com/a/")) .as_deref(), None ); @@ -248,13 +253,13 @@ mod test_url_ext { // matches and maintains extra ./ inside url. 
assert_eq!( url!("https://a.com/b//x") - .strip_prefix(&url!("https://a.com/b/")) + .strictly_relative_to(&url!("https://a.com/b/")) .as_deref(), Some("./x") ); assert_eq!( url!("https://a.com/b///x") - .strip_prefix(&url!("https://a.com/b/")) + .strictly_relative_to(&url!("https://a.com/b/")) .as_deref(), Some(".//x") ); From 8638e1aa226abc0ae0ed238d46ae0e0ec36535a5 Mon Sep 17 00:00:00 2001 From: rina Date: Sun, 25 Jan 2026 00:08:18 +1000 Subject: [PATCH 58/59] add hacky root-dir for pre-porting --- lychee-lib/src/types/base_info.rs | 58 +++++++++++++++++++++++-------- lychee-lib/src/utils/request.rs | 2 +- 2 files changed, 45 insertions(+), 15 deletions(-) diff --git a/lychee-lib/src/types/base_info.rs b/lychee-lib/src/types/base_info.rs index 61db29f9b0..eae7a1b0f5 100644 --- a/lychee-lib/src/types/base_info.rs +++ b/lychee-lib/src/types/base_info.rs @@ -77,20 +77,34 @@ impl SourceBaseInfo { if url.scheme() == "file" { Self::NoRoot(url.clone()) } else { - let mut origin = url.clone(); + match Self::split_url_origin_and_path(url) { + None => Self::no_info(), + Some((origin, path)) => Self::full_info(origin, path), + } + } + } - match origin.path_segments_mut() { - Ok(mut segments) => segments.clear(), - Err(()) => return Self::no_info(), - }; + fn split_url_origin_and_path(url: &Url) -> Option<(Url, String)> { + let origin = url.join("/").ok()?; + let subpath = origin.make_relative(&url)?; + Some((origin, subpath)) + } - let path = match url.path().strip_prefix('/') { - Some(path) => path.to_string(), - None => return Self::no_info(), - }; + /// If this is a [`SourceBaseInfo::NoRoot`], promote it to a [`SourceBaseInfo::Full`] + /// by using the filesystem root as the "origin" for root-relative links. + /// + /// Generally, this function should be avoided in favour of a more explicit + /// user-provided root directory. The filesystem root is rarely a good place + /// to look for files. + /// + /// Makes no change to other [`SourceBaseInfo`] variants. 
+ pub fn use_fs_root_as_origin(self) -> Self { + let Self::NoRoot(url) = self else { return self }; - Self::Full(origin, path) - } + let (fs_root, subpath) = Self::split_url_origin_and_path(&url) + .expect("splitting up a NoRoot file:// URL should work"); + + Self::full_info(fs_root, subpath) } pub fn supports_root_relative(&self) -> bool { @@ -132,8 +146,8 @@ impl SourceBaseInfo { /// Returns an error if the text is an invalid URL, or if the text is a /// relative link and this [`SourceBaseInfo`] variant cannot resolve /// the relative link. - pub fn parse_url_text(&self, text: &str) -> Result { - match Uri::try_from(text.as_ref()) { + pub fn parse_url_text(&self, text: &str, root_dir: Option<&Url>) -> Result { + let url = match Uri::try_from(text.as_ref()) { Ok(Uri { url }) => Ok(url), Err(e @ ErrorKind::ParseUrl(_, _)) => match self { Self::NoRoot(_) if Self::is_root_relative(text) => { @@ -150,6 +164,22 @@ impl SourceBaseInfo { Self::None => Err(e), }, Err(e) => Err(e), + }?; + + // if a root-relative link resulted in a file:// URL, then prefix + // this with root-dir. doing this after parsing prevents a `/../` + // link from traversing outside the root-dir. 
+ if let Some(root_dir) = root_dir + && Self::is_root_relative(text) + && url.scheme() == "file" + { + let (_, subpath) = + Self::split_url_origin_and_path(&url).expect("file:// URL can be split"); + root_dir + .join(&subpath) + .map_err(|e| ErrorKind::ParseUrl(e, text.to_string())) + } else { + Ok(url) } } @@ -256,7 +286,7 @@ pub fn parse_url_with_base_info( mappings: &UrlMappings, text: &str, ) -> Result { - let url = base_info.parse_url_text(text)?; + let url = base_info.parse_url_text(text, None)?; let mut url = match mappings.map_to_new_url(&url) { Some((local, subpath)) => local.join(&subpath).ok(), diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 2322e90f94..919054b854 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -32,7 +32,7 @@ fn create_request( ) -> LycheeResult { // WARN: BROKEN because this needs to do all mapping. let uri = Uri { - url: base_info.parse_url_text(&raw_uri.text)?, + url: base_info.parse_url_text(&raw_uri.text, None)?, }; let source = source.clone(); let element = raw_uri.element.clone(); From c2d74585792c97ecf5745b70d739a7b35338dc02 Mon Sep 17 00:00:00 2001 From: rina Date: Sun, 25 Jan 2026 00:29:09 +1000 Subject: [PATCH 59/59] more root dir fudging --- lychee-lib/src/types/base_info.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/lychee-lib/src/types/base_info.rs b/lychee-lib/src/types/base_info.rs index eae7a1b0f5..0997b3277c 100644 --- a/lychee-lib/src/types/base_info.rs +++ b/lychee-lib/src/types/base_info.rs @@ -2,6 +2,7 @@ //! applying base URL and root dir mappings. use reqwest::Url; +use std::borrow::Cow; use std::path::Path; use crate::Base; @@ -147,18 +148,25 @@ impl SourceBaseInfo { /// relative link and this [`SourceBaseInfo`] variant cannot resolve /// the relative link. 
pub fn parse_url_text(&self, text: &str, root_dir: Option<&Url>) -> Result { + // HACK: if root-dir is specified, apply it by fudging around with + // file:// URLs. also see bottom of this function. + let fake_base_info = match root_dir { + Some(_) => Cow::Owned(self.clone().use_fs_root_as_origin()), + None => Cow::Borrowed(self), + }; + let url = match Uri::try_from(text.as_ref()) { Ok(Uri { url }) => Ok(url), - Err(e @ ErrorKind::ParseUrl(_, _)) => match self { + Err(e @ ErrorKind::ParseUrl(_, _)) => match *fake_base_info { Self::NoRoot(_) if Self::is_root_relative(text) => { // TODO: report more errors if a --root-dir is specified but URL falls outside of // thingy Err(ErrorKind::InvalidBaseJoin(text.to_string())) } - Self::NoRoot(base) => base + Self::NoRoot(ref base) => base .join_rooted(&[&text]) .map_err(|e| ErrorKind::ParseUrl(e, text.to_string())), - Self::Full(origin, subpath) => origin + Self::Full(ref origin, ref subpath) => origin .join_rooted(&[subpath, &text]) .map_err(|e| ErrorKind::ParseUrl(e, text.to_string())), Self::None => Err(e),