diff --git a/Cargo.lock b/Cargo.lock index 53bdcf6024..bc30b08c33 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2865,6 +2865,7 @@ dependencies = [ "toml", "typed-builder", "url", + "walkdir", "wiremock", ] diff --git a/README.md b/README.md index 924b3eac4d..02129489a7 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Available as a command-line utility, a library and a [GitHub Action](https://git + ## Table of Contents - [Development](#development) @@ -573,7 +574,7 @@ Options: Find links in verbatim sections like `pre`- and `code` blocks --include-wikilinks - Check WikiLinks in Markdown files + Check WikiLinks in Markdown files, this requires specifying --base-url --index-files When checking locally, resolves directory links to a separate index file. diff --git a/fixtures/wiki/Dash-Usage.md b/fixtures/wiki/Dash-Usage.md new file mode 100644 index 0000000000..6c67b6a977 --- /dev/null +++ b/fixtures/wiki/Dash-Usage.md @@ -0,0 +1 @@ +# Header diff --git a/fixtures/wiki/Non-existent.md b/fixtures/wiki/Non-existent.md new file mode 100644 index 0000000000..45ed48a5eb --- /dev/null +++ b/fixtures/wiki/Non-existent.md @@ -0,0 +1,5 @@ +# Links to non-existing Files + +[[Does not exist]] +[[Doesn't exist.md]] +[[Does_not_exist]] diff --git a/fixtures/wiki/Space Usage.md b/fixtures/wiki/Space Usage.md new file mode 100644 index 0000000000..6c67b6a977 --- /dev/null +++ b/fixtures/wiki/Space Usage.md @@ -0,0 +1 @@ +# Header diff --git a/fixtures/wiki/Underscore_Usage.md b/fixtures/wiki/Underscore_Usage.md new file mode 100644 index 0000000000..6c67b6a977 --- /dev/null +++ b/fixtures/wiki/Underscore_Usage.md @@ -0,0 +1 @@ +# Header diff --git a/fixtures/wiki/Usage.md b/fixtures/wiki/Usage.md new file mode 100644 index 0000000000..6c67b6a977 --- /dev/null +++ b/fixtures/wiki/Usage.md @@ -0,0 +1 @@ +# Header diff --git a/fixtures/wiki/obsidian-style-plus-headers.md b/fixtures/wiki/obsidian-style-plus-headers.md new file mode 100644 index 0000000000..0b892aafc7 --- 
/dev/null +++ b/fixtures/wiki/obsidian-style-plus-headers.md @@ -0,0 +1,8 @@ +[[#LocalHeader]] + +# LocalHeader + +[[Usage#Header|HeaderRenaming]] +[[Space Usage#Header|HeaderRenaming]] +[[Space Usage DifferentDirectory#Header|HeaderRenaming]] +[[DifferentDirectory#Header|HeaderRenaming]] diff --git a/fixtures/wiki/obsidian-style.md b/fixtures/wiki/obsidian-style.md new file mode 100644 index 0000000000..4911206397 --- /dev/null +++ b/fixtures/wiki/obsidian-style.md @@ -0,0 +1,4 @@ +[[Usage]] +[[Space Usage]] +[[Space Usage DifferentDirectory]] +[[DifferentDirectory]] diff --git a/fixtures/wiki/subdirectory/Different-Directory-Dash.md b/fixtures/wiki/subdirectory/Different-Directory-Dash.md new file mode 100644 index 0000000000..6c67b6a977 --- /dev/null +++ b/fixtures/wiki/subdirectory/Different-Directory-Dash.md @@ -0,0 +1 @@ +# Header diff --git a/fixtures/wiki/subdirectory/DifferentDirectory.md b/fixtures/wiki/subdirectory/DifferentDirectory.md new file mode 100644 index 0000000000..6c67b6a977 --- /dev/null +++ b/fixtures/wiki/subdirectory/DifferentDirectory.md @@ -0,0 +1 @@ +# Header diff --git a/fixtures/wiki/subdirectory/Different_Directory_Underscore.md b/fixtures/wiki/subdirectory/Different_Directory_Underscore.md new file mode 100644 index 0000000000..6c67b6a977 --- /dev/null +++ b/fixtures/wiki/subdirectory/Different_Directory_Underscore.md @@ -0,0 +1 @@ +# Header diff --git a/fixtures/wiki/subdirectory/Space Usage DifferentDirectory.md b/fixtures/wiki/subdirectory/Space Usage DifferentDirectory.md new file mode 100644 index 0000000000..6c67b6a977 --- /dev/null +++ b/fixtures/wiki/subdirectory/Space Usage DifferentDirectory.md @@ -0,0 +1 @@ +# Header diff --git a/fixtures/wiki/wikilink-style.md b/fixtures/wiki/wikilink-style.md new file mode 100644 index 0000000000..61479263a2 --- /dev/null +++ b/fixtures/wiki/wikilink-style.md @@ -0,0 +1,19 @@ +[[#LocalHeader]] + +[[Usage]] +[[Space Usage]] +[[Dash Usage]] +[[Underscore Usage]] +[[DifferentDirectory]] 
+[[Different Directory Dash]] +[[Different Directory Underscore]] + +[[Usage#Header|HeaderRenaming]] +[[Space Usage#Header|HeaderRenaming]] +[[Dash Usage#Header|HeaderRenaming]] +[[Underscore Usage#Header|HeaderRenaming]] +[[DifferentDirectory#Header|HeaderRenaming]] +[[Different Directory Dash#Header|HeaderRenaming]] +[[Different Directory Underscore#Header|HeaderRenaming]] + +# LocalHeader diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index 3efefffe6f..d1b8c578ac 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -55,6 +55,7 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - .include_fragments(cfg.include_fragments) .fallback_extensions(cfg.fallback_extensions.clone()) .index_files(cfg.index_files.clone()) + .include_wikilinks(cfg.include_wikilinks) .rate_limit_config(RateLimitConfig::from_options( cfg.host_concurrency, cfg.host_request_interval, diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index b9be8fe521..1f4677e918 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -891,7 +891,8 @@ and existing cookies will be updated." pub(crate) cookie_jar: Option, #[allow(clippy::doc_markdown)] - /// Check WikiLinks in Markdown files + /// Check WikiLinks in Markdown files, this requires specifying --base-url + #[clap(requires = "base_url")] #[arg(long)] #[serde(default)] pub(crate) include_wikilinks: bool, diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index c0cae571d0..6f43752f75 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -2586,6 +2586,8 @@ The config file should contain every possible key for documentation purposes." cargo_bin_cmd!() .arg("--dump") .arg("--include-wikilinks") + .arg("--base-url") + .arg(fixtures_path!()) .arg(test_path) .assert() .success() @@ -3046,6 +3048,74 @@ The config file should contain every possible key for documentation purposes." 
.stdout(contains("https://example.org")); // Should extract the link as plaintext } + #[test] + fn test_wikilink_fixture_obsidian_style() { + let input = fixtures_path!().join("wiki/obsidian-style.md"); + + // testing without fragments should not yield failures + cargo_bin_cmd!() + .arg(&input) + .arg("--include-wikilinks") + .arg("--fallback-extensions") + .arg("md") + .arg("--base-url") + .arg(fixtures_path!()) + .assert() + .success() + .stdout(contains("4 OK")); + } + + #[test] + fn test_wikilink_fixture_wikilink_non_existent() { + let input = fixtures_path!().join("wiki/Non-existent.md"); + + cargo_bin_cmd!() + .arg(&input) + .arg("--include-wikilinks") + .arg("--fallback-extensions") + .arg("md") + .arg("--base-url") + .arg(fixtures_path!()) + .assert() + .failure() + .stdout(contains("3 Errors")); + } + + #[test] + fn test_wikilink_fixture_with_fragments_obsidian_style_fixtures_excluded() { + let input = fixtures_path!().join("wiki/obsidian-style-plus-headers.md"); + + // fragments should resolve all headers + cargo_bin_cmd!() + .arg(&input) + .arg("--include-wikilinks") + .arg("--fallback-extensions") + .arg("md") + .arg("--base-url") + .arg(fixtures_path!()) + .assert() + .success() + .stdout(contains("4 OK")); + } + + #[test] + fn test_wikilink_fixture_with_fragments_obsidian_style() { + let input = fixtures_path!().join("wiki/obsidian-style-plus-headers.md"); + + // fragments should resolve all headers + cargo_bin_cmd!() + .arg(&input) + .arg("--include-wikilinks") + .arg("--include-fragments") + .arg("--fallback-extensions") + .arg("md") + .arg("--base-url") + .arg(fixtures_path!()) + .assert() + .success() + .stdout(contains("4 OK")); + } + /// An input which matches nothing should print a warning and continue. 
#[test] fn test_input_matching_nothing_warns() -> Result<()> { diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index 0c94b62ca5..cddd684970 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -64,6 +64,7 @@ tokio = { version = "1.48.0", features = ["full"] } toml = "0.9.10" typed-builder = "0.23.2" url = { version = "2.5.7", features = ["serde"] } +walkdir = "2.5.0" [dependencies.par-stream] version = "0.10.2" diff --git a/lychee-lib/src/checker/file.rs b/lychee-lib/src/checker/file.rs index 122eadf7f8..f60f3bf9eb 100644 --- a/lychee-lib/src/checker/file.rs +++ b/lychee-lib/src/checker/file.rs @@ -3,8 +3,9 @@ use log::warn; use std::borrow::Cow; use std::path::{Path, PathBuf}; +use crate::checker::wikilink::resolver::WikilinkResolver; use crate::{ - Base, ErrorKind, Status, Uri, + Base, ErrorKind, Result, Status, Uri, utils::fragment_checker::{FragmentChecker, FragmentInput}, }; @@ -34,6 +35,8 @@ pub(crate) struct FileChecker { include_fragments: bool, /// Utility for performing fragment checks in HTML files. fragment_checker: FragmentChecker, + /// Utility for optionally resolving Wikilinks. + wikilink_resolver: Option, } impl FileChecker { @@ -45,19 +48,35 @@ impl FileChecker { /// * `fallback_extensions` - List of extensions to try if the original file is not found. /// * `index_files` - Optional list of index file names to search for if the path is a directory. /// * `include_fragments` - Whether to check for fragment existence in HTML files. + /// * `include_wikilinks` - Whether to check the existence of Wikilinks found in Markdown files . + /// + /// # Errors + /// + /// Fails if an invalid `base` is provided when including wikilinks. pub(crate) fn new( base: Option, fallback_extensions: Vec, index_files: Option>, include_fragments: bool, - ) -> Self { - Self { + include_wikilinks: bool, + ) -> Result { + let wikilink_resolver = if include_wikilinks { + Some(WikilinkResolver::new( + base.as_ref(), + fallback_extensions.clone(), + )?) 
+ } else { + None + }; + + Ok(Self { base, fallback_extensions, index_files, include_fragments, fragment_checker: FragmentChecker::new(), - } + wikilink_resolver, + }) } /// Checks the given file URI for existence and validity. @@ -127,16 +146,20 @@ impl FileChecker { /// Returns `Ok` with the resolved path if it is valid, otherwise returns /// `Err` with an appropriate error. The returned path, if any, is guaranteed /// to exist and may be a file or a directory. - fn resolve_local_path<'a>( - &self, - path: &'a Path, - uri: &Uri, - ) -> Result, ErrorKind> { + fn resolve_local_path<'a>(&self, path: &'a Path, uri: &Uri) -> Result> { let path = match path.metadata() { // for non-existing paths, attempt fallback extensions - Err(e) if e.kind() == std::io::ErrorKind::NotFound => { - self.apply_fallback_extensions(path, uri).map(Cow::Owned) - } + // if fallback extensions don't help, try wikilinks + Err(e) if e.kind() == std::io::ErrorKind::NotFound => self + .apply_fallback_extensions(path, uri) + .or_else(|_| { + if let Some(resolver) = &self.wikilink_resolver { + resolver.resolve(path, uri) + } else { + Err(ErrorKind::InvalidFilePath(uri.clone())) + } + }) + .map(Cow::Owned), // other IO errors are unexpected and should fail the check Err(e) => Err(ErrorKind::ReadFileInput(e, path.to_path_buf())), @@ -181,7 +204,7 @@ impl FileChecker { /// /// Returns `Ok(PathBuf)` with the resolved file path, or `Err` if no valid file is found. /// If `Ok` is returned, the contained `PathBuf` is guaranteed to exist and be a file. - fn apply_fallback_extensions(&self, path: &Path, uri: &Uri) -> Result { + fn apply_fallback_extensions(&self, path: &Path, uri: &Uri) -> Result { // If it's already a file, use it directly if path.is_file() { return Ok(path.to_path_buf()); @@ -221,7 +244,7 @@ impl FileChecker { /// is guaranteed to exist. In most cases, the returned path will be a file path. /// /// If index files are disabled, simply returns `Ok(dir_path)`. 
- fn apply_index_files(&self, dir_path: &Path) -> Result { + fn apply_index_files(&self, dir_path: &Path) -> Result { // this implements the "disabled" case by treating a directory as its // own index file. let index_names_to_try = match &self.index_files { @@ -372,7 +395,7 @@ mod tests { #[tokio::test] async fn test_default() { // default behaviour accepts dir links as long as the directory exists. - let checker = FileChecker::new(None, vec![], None, true); + let checker = FileChecker::new(None, vec![], None, true, false).unwrap(); assert_filecheck!(&checker, "filechecker/index_dir", Status::Ok(_)); @@ -430,7 +453,9 @@ mod tests { vec![], Some(vec!["index.html".to_owned(), "index.md".to_owned()]), true, - ); + false, + ) + .unwrap(); assert_resolves!( &checker, @@ -468,7 +493,9 @@ mod tests { vec!["html".to_owned()], Some(vec!["index".to_owned()]), false, - ); + false, + ) + .unwrap(); // this test case has a subdir 'same_name' and a file 'same_name.html'. // this shows that the index file resolving is applied in this case and @@ -492,7 +519,8 @@ mod tests { #[tokio::test] async fn test_empty_index_list_corner() { // empty index_files list will reject all directory links - let checker_no_indexes = FileChecker::new(None, vec![], Some(vec![]), false); + let checker_no_indexes = + FileChecker::new(None, vec![], Some(vec![]), false, false).unwrap(); assert_resolves!( &checker_no_indexes, "filechecker/index_dir", @@ -516,7 +544,8 @@ mod tests { "..".to_owned(), "/".to_owned(), ]; - let checker_dir_indexes = FileChecker::new(None, vec![], Some(dir_names), false); + let checker_dir_indexes = + FileChecker::new(None, vec![], Some(dir_names), false, false).unwrap(); assert_resolves!( &checker_dir_indexes, "filechecker/index_dir", @@ -537,7 +566,9 @@ mod tests { vec![], Some(vec!["../index_dir/index.html".to_owned()]), true, - ); + false, + ) + .unwrap(); assert_resolves!( &checker_dotdot, "filechecker/empty_dir#fragment", @@ -550,7 +581,8 @@ mod tests { .to_str() 
.expect("expected utf-8 fixtures path") .to_owned(); - let checker_absolute = FileChecker::new(None, vec![], Some(vec![absolute_html]), true); + let checker_absolute = + FileChecker::new(None, vec![], Some(vec![absolute_html]), true, false).unwrap(); assert_resolves!( &checker_absolute, "filechecker/empty_dir#fragment", @@ -560,7 +592,7 @@ mod tests { #[tokio::test] async fn test_fallback_extensions_on_directories() { - let checker = FileChecker::new(None, vec!["html".to_owned()], None, true); + let checker = FileChecker::new(None, vec!["html".to_owned()], None, true, false).unwrap(); // fallback extensions should be applied when directory links are resolved // to directories (i.e., the default index_files behavior or if `.` diff --git a/lychee-lib/src/checker/mod.rs b/lychee-lib/src/checker/mod.rs index bfbef9de51..803b1aa9ea 100644 --- a/lychee-lib/src/checker/mod.rs +++ b/lychee-lib/src/checker/mod.rs @@ -5,3 +5,4 @@ pub(crate) mod file; pub(crate) mod mail; pub(crate) mod website; +pub(crate) mod wikilink; diff --git a/lychee-lib/src/checker/wikilink/index.rs b/lychee-lib/src/checker/wikilink/index.rs new file mode 100644 index 0000000000..03ce53f78f --- /dev/null +++ b/lychee-lib/src/checker/wikilink/index.rs @@ -0,0 +1,64 @@ +use log::info; +use std::collections::HashMap; +use std::ffi::OsString; +use std::path::Path; +use std::sync::Mutex; +use std::{path::PathBuf, sync::Arc}; +use walkdir::WalkDir; + +/// Indexes a given directory mapping filenames to their corresponding path. +/// +/// The `WikilinkIndex` recursively checks all subdirectories of the given +/// base directory mapping any found files to the path where they can be found. +/// Symlinks are ignored to prevent it from infinite loops. 
+#[derive(Clone, Debug)] +pub(crate) struct WikilinkIndex { + filenames: Arc<Mutex<HashMap<OsString, PathBuf>>>, + /// Local base directory + local_base: PathBuf, +} + +impl WikilinkIndex { + pub(crate) fn new(local_base: PathBuf) -> Self { + let index = Self { + local_base, + filenames: Arc::new(Mutex::new(HashMap::new())), + }; + index.start_indexing(); + index + } + + /// Populates the index of the `WikilinkIndex` on startup by walking + /// the local base directory, mapping each filename to an absolute filepath. + pub(crate) fn start_indexing(&self) { + // The base has already been validated as a local path by `WikilinkResolver::new` + info!( + "Starting file indexing for wikilinks in {}", + self.local_base.display() + ); + + for entry in WalkDir::new(&self.local_base) + // actively ignore symlinks + .follow_links(false) + .into_iter() + .filter_map(Result::ok) + { + if let Some(filename) = entry.path().file_name() { + self.filenames + .lock() + .unwrap() + .insert(filename.to_os_string(), entry.path().to_path_buf()); + } + } + } + + /// Checks the index for a filename, returning the absolute path if the name is found, + /// otherwise returning `None` + pub(crate) fn contains_path(&self, path: &Path) -> Option<PathBuf> { + self.filenames + .lock() + .unwrap() + .get(path.file_name()?) + .cloned() + } +} diff --git a/lychee-lib/src/checker/wikilink/mod.rs b/lychee-lib/src/checker/wikilink/mod.rs new file mode 100644 index 0000000000..e11b2dcfa3 --- /dev/null +++ b/lychee-lib/src/checker/wikilink/mod.rs @@ -0,0 +1,98 @@ +//! `WikiLink` Module +//! +//! This module contains an Indexer and a Resolver for `WikiLinks` +//! The Indexer recursively indexes the subdirectories and files in a given base-directory, mapping +//! the filename to the full path +//! The Resolver looks for found `WikiLinks` in the Index, thus resolving the `WikiLink` to a full +//! 
filepath + +pub(crate) mod index; +pub(crate) mod resolver; + +use crate::ErrorKind; +use pulldown_cmark::CowStr; + +/// In Markdown Links both '#' and '|' act as modifiers +/// '#' links to a specific Header in a file +/// '|' is used to modify the link name, a so-called "pothole" +const MARKDOWN_FRAGMENT_MARKER: char = '#'; +const MARKDOWN_POTHOLE_MARKER: char = '|'; + +/// Clean a `WikiLink` by removing potholes and fragments from a `&str` +pub(crate) fn wikilink(input: &str, has_pothole: bool) -> Result<CowStr<'_>, ErrorKind> { + // Strip pothole marker (|) and pothole (text after marker) from wikilinks + let mut stripped_input = if has_pothole { + pulldown_cmark::CowStr::Borrowed( + &input[0..input.find(MARKDOWN_POTHOLE_MARKER).unwrap_or(input.len())], + ) + } else { + CowStr::Borrowed(input) + }; + + // Strip fragments (#) from wikilinks, according to the obsidian spec + // fragments always come before potholes + // https://help.obsidian.md/links#Change+the+link+display+text + if stripped_input.contains(MARKDOWN_FRAGMENT_MARKER) { + stripped_input = pulldown_cmark::CowStr::Borrowed( + // In theory a second '#' could be inserted into the pothole, so searching for the + // first occurrence from the left should yield the correct result + &input[0..input.find(MARKDOWN_FRAGMENT_MARKER).unwrap_or(input.len())], + ); + } + if stripped_input.is_empty() { + return Err(ErrorKind::EmptyUrl); + } + Ok(stripped_input) +} + +#[cfg(test)] +mod tests { + use pulldown_cmark::CowStr; + use rstest::rstest; + + use crate::checker::wikilink::wikilink; + + // All these Links are missing the targetname itself but contain valid fragment- and + // pothole-modifications. 
They would be parsed as an empty Link + #[rstest] + #[case("|foo", true)] + #[case("|foo#bar", true)] + #[case("|foo#bar|foo#bar", true)] + #[case("#baz", false)] + #[case("#baz#baz|foo", false)] + fn test_empty_wikilinks_are_detected(#[case] input: &str, #[case] has_pothole: bool) { + let result = wikilink(input, has_pothole); + assert!(result.is_err()); + } + + #[rstest] + #[case("link with spaces", true, "link with spaces")] + #[case("foo.fileextension", true, "foo.fileextension")] + #[case("specialcharacters !_@$&(){}", true, "specialcharacters !_@$&(){}")] + fn test_valid_wikilinks(#[case] input: &str, #[case] has_pothole: bool, #[case] actual: &str) { + let result = wikilink(input, has_pothole).unwrap(); + let actual = CowStr::Borrowed(actual); + assert_eq!(result, actual); + } + + #[rstest] + #[case("foo|bar", true, "foo")] + #[case("foo#bar", true, "foo")] + #[case("foo#bar|baz", false, "foo")] + #[case("foo#bar|baz#hashtag_in_pothole", false, "foo")] + #[case("foo with spaces#bar|baz#hashtag_in_pothole", false, "foo with spaces")] + #[case( + "specialcharacters !_@$&(){}#bar|baz#hashtag_in_pothole", + true, + "specialcharacters !_@$&(){}" + )] + fn test_fragment_and_pothole_removal( + #[case] input: &str, + #[case] has_pothole: bool, + #[case] actual: &str, + ) { + let result = wikilink(input, has_pothole).unwrap(); + let actual = CowStr::Borrowed(actual); + assert_eq!(result, actual); + } +} diff --git a/lychee-lib/src/checker/wikilink/resolver.rs b/lychee-lib/src/checker/wikilink/resolver.rs new file mode 100644 index 0000000000..247f3cc674 --- /dev/null +++ b/lychee-lib/src/checker/wikilink/resolver.rs @@ -0,0 +1,88 @@ +use crate::{Base, ErrorKind, Uri, checker::wikilink::index::WikilinkIndex}; +use std::path::{Path, PathBuf}; + +#[derive(Clone, Debug)] +pub(crate) struct WikilinkResolver { + checker: WikilinkIndex, + fallback_extensions: Vec, +} + +/// Tries to resolve a `WikiLink` by searching for the filename in the `WikilinkIndex` +/// Returns the 
path of the resolved file if it exists, otherwise an Error +impl WikilinkResolver { + /// # Errors + /// + /// Fails if `base` is not `Some(Base::Local(_))`. + pub(crate) fn new( + base: Option<&Base>, + fallback_extensions: Vec<String>, + ) -> Result<Self, ErrorKind> { + let base = match base { + None => Err(ErrorKind::WikilinkInvalidBase( + "Base must be specified for wikilink checking".into(), + ))?, + Some(base) => match base { + Base::Local(p) => p, + Base::Remote(_) => Err(ErrorKind::WikilinkInvalidBase( + "Base cannot be remote".to_string(), + ))?, + }, + }; + + Ok(Self { + checker: WikilinkIndex::new(base.clone()), + fallback_extensions, + }) + } + /// Resolves a wikilink by searching the index with fallback extensions. + pub(crate) fn resolve(&self, path: &Path, uri: &Uri) -> Result<PathBuf, ErrorKind> { + for ext in &self.fallback_extensions { + let mut candidate = path.to_path_buf(); + candidate.set_extension(ext); + + if let Some(resolved) = self.checker.contains_path(&candidate) { + return Ok(resolved); + } + } + + Err(ErrorKind::WikilinkNotFound(uri.clone(), path.to_path_buf())) + } +} + +#[cfg(test)] +mod tests { + use crate::{Base, ErrorKind, Uri, checker::wikilink::resolver::WikilinkResolver}; + use test_utils::{fixture_uri, fixtures_path}; + + #[test] + fn test_wikilink_resolves_to_filename() { + let resolver = WikilinkResolver::new( + Some(&Base::Local(fixtures_path!().join("wiki"))), + vec!["md".to_string()], + ) + .unwrap(); + let uri = Uri { + url: fixture_uri!("wiki/Usage"), + }; + let path = fixtures_path!().join("Usage"); + let expected_result = fixtures_path!().join("wiki/Usage.md"); + assert_eq!(resolver.resolve(&path, &uri), Ok(expected_result)); + } + + #[test] + fn test_wikilink_not_found() { + let resolver = WikilinkResolver::new( + Some(&Base::Local(fixtures_path!().join("wiki"))), + vec!["md".to_string()], + ) + .unwrap(); + let uri = Uri { + url: fixture_uri!("wiki/404"), + }; + let path = fixtures_path!().join("404"); + assert!(matches!( + resolver.resolve(&path, &uri), + 
Err(ErrorKind::WikilinkNotFound(..)) + )); + } +} diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index e0ec4e6c5c..31c2480257 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -300,6 +300,10 @@ pub struct ClientBuilder { /// Enable the checking of fragments in links. include_fragments: bool, + /// Enable the checking of wikilinks in markdown files. + /// Note that base must not be `None` if you set this `true`. + include_wikilinks: bool, + /// Requests run through this chain where each item in the chain /// can modify the request. A chained item can also decide to exit /// early and return a status, so that subsequent chain items are @@ -398,7 +402,8 @@ impl ClientBuilder { self.fallback_extensions, self.index_files, self.include_fragments, - ), + self.include_wikilinks, + )?, }) } diff --git a/lychee-lib/src/extract/markdown.rs b/lychee-lib/src/extract/markdown.rs index 4b3720206f..126d9faab4 100644 --- a/lychee-lib/src/extract/markdown.rs +++ b/lychee-lib/src/extract/markdown.rs @@ -1,9 +1,11 @@ //! 
Extract links and fragments from markdown documents use std::collections::{HashMap, HashSet}; +use log::warn; use pulldown_cmark::{CowStr, Event, LinkType, Options, Parser, Tag, TagEnd, TextMergeWithOffset}; use crate::{ + checker::wikilink::wikilink, extract::{html::html5gum::extract_html_with_span, plaintext::extract_raw_uri_from_plaintext}, types::uri::raw::{ OffsetSpanProvider, RawUri, RawUriSpan, SourceSpanProvider, SpanProvider as _, @@ -86,7 +88,7 @@ pub(crate) fn extract_markdown( Some(extract_raw_uri_from_plaintext(&dest_url, &span_provider)) } // Wiki URL (`[[http://example.com]]`) - LinkType::WikiLink { has_pothole: _ } => { + LinkType::WikiLink { has_pothole } => { // Exclude WikiLinks if not explicitly enabled if !include_wikilinks { return None; @@ -97,8 +99,18 @@ pub(crate) fn extract_markdown( return None; } - // wiki links start with `[[`, so offset the span by `2` - Some(raw_uri(&dest_url, span_provider.span(span.start + 2))) + if let Ok(wikilink) = wikilink(&dest_url, has_pothole) { + Some(vec![RawUri { + text: wikilink.to_string(), + element: Some("a".to_string()), + attribute: Some("wikilink".to_string()), + // wiki links start with `[[`, so offset the span by `2` + span: span_provider.span(span.start + 2) + }]) + } else { + warn!("The wikilink destination url {dest_url} could not be cleaned by removing potholes and fragments"); + None + } } } } @@ -530,7 +542,7 @@ $$ let expected = vec![RawUri { text: "https://example.com/destination".to_string(), element: Some("a".to_string()), - attribute: Some("href".to_string()), + attribute: Some("wikilink".to_string()), span: span(1, 3), }]; let uris = extract_markdown(markdown, true, true); @@ -544,13 +556,13 @@ $$ RawUri { text: "https://example.com/destination".to_string(), element: Some("a".to_string()), - attribute: Some("href".to_string()), + attribute: Some("wikilink".to_string()), span: span(1, 3), }, RawUri { text: "https://example.com/source".to_string(), element: Some("a".to_string()), - 
attribute: Some("href".to_string()), + attribute: Some("wikilink".to_string()), span: span(1, 38), }, ]; @@ -679,6 +691,37 @@ Shortcut link: [link4] } } + #[test] + fn test_clean_wikilink() { + let markdown = r" +[[foo|bar]] +[[foo#bar]] +[[foo#bar|baz]] +"; + let uris = extract_markdown(markdown, true, true); + let expected = vec![ + RawUri { + text: "foo".to_string(), + element: Some("a".to_string()), + attribute: Some("wikilink".to_string()), + span: span(2, 3), + }, + RawUri { + text: "foo".to_string(), + element: Some("a".to_string()), + attribute: Some("wikilink".to_string()), + span: span(3, 3), + }, + RawUri { + text: "foo".to_string(), + element: Some("a".to_string()), + attribute: Some("wikilink".to_string()), + span: span(4, 3), + }, + ]; + assert_eq!(uris, expected); + } + #[test] fn test_nested_html() { let input = r#" @@ -699,6 +742,18 @@ Shortcut link: [link4] assert_eq!(uris, expected); } + #[test] + fn test_wikilink_extraction_returns_none_on_empty_links() { + let markdown = r" +[[|bar]] +[[#bar]] +[[#bar|baz]] +"; + + let uris = extract_markdown(markdown, true, true); + assert!(uris.is_empty()); + } + #[test] fn test_mdx_multiline_jsx() { let input = r#" @@ -758,4 +813,17 @@ Shortcut link: [link4] assert_eq!(uri.attribute, Some("href".to_string())); } } + + #[test] + fn test_remove_wikilink_potholes_and_fragments() { + let markdown = r"[[foo#bar|baz]]"; + let uris = extract_markdown(markdown, true, true); + let expected = vec![RawUri { + text: "foo".to_string(), + element: Some("a".to_string()), + attribute: Some("wikilink".to_string()), + span: span(1, 3), + }]; + assert_eq!(uris, expected); + } } diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index a351c4dbbf..8b70791af9 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -178,6 +178,14 @@ pub enum ErrorKind { /// The reason the command failed reason: String, }, + + /// The extracted `WikiLink` could not be found by searching the 
directory + #[error("Wikilink {0} not found at {1}")] + WikilinkNotFound(Uri, PathBuf), + + /// Error on creation of the `WikilinkResolver` + #[error("Failed to initialize wikilink checker: {0}")] + WikilinkInvalidBase(String), } impl ErrorKind { @@ -335,7 +343,13 @@ impl ErrorKind { [name] => format!("An index file ({name}) is required"), [init @ .., tail] => format!("An index file ({}, or {}) is required", init.join(", "), tail), }.into(), - ErrorKind::PreprocessorError{command, reason} => Some(format!("Command '{command}' failed {reason}. Check value of the preprocessor option")), + ErrorKind::PreprocessorError{command, reason} => Some(format!("Command '{command}' failed {reason}. Check value of the pre option")), + ErrorKind::WikilinkNotFound(uri, pathbuf) => Some(format!( + "WikiLink {uri} could not be found at {:}", pathbuf.display() + )), + ErrorKind::WikilinkInvalidBase(reason) => Some(format!( + "WikiLink Resolver could not be created: {reason} ", + )), } } @@ -466,6 +480,8 @@ impl Hash for ErrorKind { Self::Cookies(e) => e.hash(state), Self::StatusCodeSelectorError(e) => e.to_string().hash(state), Self::PreprocessorError { command, reason } => (command, reason).hash(state), + Self::WikilinkNotFound(uri, pathbuf) => (uri, pathbuf).hash(state), + Self::WikilinkInvalidBase(e) => e.hash(state), } } }