diff --git a/lychee-lib/src/extract/markdown.rs b/lychee-lib/src/extract/markdown.rs index 51099097c1..c8f7214b56 100644 --- a/lychee-lib/src/extract/markdown.rs +++ b/lychee-lib/src/extract/markdown.rs @@ -23,6 +23,7 @@ pub(crate) fn extract_markdown( // which is why we keep track of entries and exits while traversing the input. let mut inside_code_block = false; let mut inside_link_block = false; + let mut inside_wikilink_block = false; let parser = TextMergeStream::new(Parser::new_ext(input, md_extensions())); parser @@ -40,6 +41,7 @@ pub(crate) fn extract_markdown( // Inline link like `[foo](bar)` // This is the most common link type LinkType::Inline => { + inside_link_block = true; Some(vec![RawUri { text: dest_url.to_string(), // Emulate `` tag here to be compatible with @@ -60,7 +62,10 @@ pub(crate) fn extract_markdown( // Shortcut link like `[foo]` LinkType::Shortcut | // Shortcut without destination in the document, but resolved by the `broken_link_callback` - LinkType::ShortcutUnknown | + LinkType::ShortcutUnknown => { + inside_link_block = true; + Some(extract_raw_uri_from_plaintext(&dest_url)) + }, // Autolink like `` LinkType::Autolink | // Email address in autolink like `` @@ -72,7 +77,7 @@ pub(crate) fn extract_markdown( if !include_wikilinks { return None; } - inside_link_block = true; + inside_wikilink_block = true; //Ignore gitlab toc notation: https://docs.gitlab.com/user/markdown/#table-of-contents if ["_TOC_".to_string(), "TOC".to_string()].contains(&dest_url.to_string()) { return None; @@ -111,7 +116,9 @@ pub(crate) fn extract_markdown( // A text node. Event::Text(txt) => { - if (inside_code_block && !include_verbatim) || inside_link_block { + if inside_wikilink_block + || (inside_link_block && !include_verbatim) + || (inside_code_block && !include_verbatim) { None } else { Some(extract_raw_uri_from_plaintext(&txt)) @@ -137,6 +144,7 @@ pub(crate) fn extract_markdown( // A detected link block. Event::End(TagEnd::Link) => { inside_link_block = false; + inside_wikilink_block = false; None } @@ -439,4 +447,61 @@ $$ let uris = extract_markdown(markdown, true, true); assert!(uris.is_empty()); } + + #[test] + fn test_link_text_not_checked() { + // Test that link text is not extracted as a separate link by default + let markdown = + r"[https://lycheerepublic.gov/notexist (archive.org link)](https://example.com)"; + let uris = extract_markdown(markdown, false, false); + + // Should only extract the destination URL, not the link text + let expected = vec![RawUri { + text: "https://example.com".to_string(), + element: Some("a".to_string()), + attribute: Some("href".to_string()), + }]; + + assert_eq!(uris, expected); + assert_eq!( + uris.len(), + 1, + "Should only find destination URL, not link text" + ); + } + + #[test] + fn test_link_text_checked_with_include_verbatim() { + // Test that link text IS extracted when include_verbatim is true + let markdown = + r"[https://lycheerepublic.gov/notexist (archive.org link)](https://example.com)"; + let uris = extract_markdown(markdown, true, false); + + // Should extract both the link text AND the destination URL + let expected = vec![ + RawUri { + text: "https://example.com".to_string(), + element: Some("a".to_string()), + attribute: Some("href".to_string()), + }, + RawUri { + text: "https://lycheerepublic.gov/notexist".to_string(), + element: None, + attribute: None, + }, + ]; + + assert_eq!( + uris.len(), + 2, + "Should find both destination URL and link text" + ); + // Check that both expected URLs are present (order might vary) + for expected_uri in expected { + assert!( + uris.contains(&expected_uri), + "Missing expected URI: {expected_uri:?}" + ); + } + } }