Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 68 additions & 3 deletions lychee-lib/src/extract/markdown.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ pub(crate) fn extract_markdown(
// which is why we keep track of entries and exits while traversing the input.
let mut inside_code_block = false;
let mut inside_link_block = false;
let mut inside_wikilink_block = false;

let parser = TextMergeStream::new(Parser::new_ext(input, md_extensions()));
parser
Expand All @@ -40,6 +41,7 @@ pub(crate) fn extract_markdown(
// Inline link like `[foo](bar)`
// This is the most common link type
LinkType::Inline => {
inside_link_block = true;
Some(vec![RawUri {
text: dest_url.to_string(),
// Emulate `<a href="...">` tag here to be compatible with
Expand All @@ -60,7 +62,10 @@ pub(crate) fn extract_markdown(
// Shortcut link like `[foo]`
LinkType::Shortcut |
// Shortcut without destination in the document, but resolved by the `broken_link_callback`
LinkType::ShortcutUnknown |
LinkType::ShortcutUnknown => {
inside_link_block = true;
Some(extract_raw_uri_from_plaintext(&dest_url))
},
// Autolink like `<http://foo.bar/baz>`
LinkType::Autolink |
// Email address in autolink like `<john@example.org>`
Expand All @@ -72,7 +77,7 @@ pub(crate) fn extract_markdown(
if !include_wikilinks {
return None;
}
inside_link_block = true;
inside_wikilink_block = true;
//Ignore gitlab toc notation: https://docs.gitlab.com/user/markdown/#table-of-contents
if ["_TOC_".to_string(), "TOC".to_string()].contains(&dest_url.to_string()) {
return None;
Expand Down Expand Up @@ -111,7 +116,9 @@ pub(crate) fn extract_markdown(

// A text node.
Event::Text(txt) => {
if (inside_code_block && !include_verbatim) || inside_link_block {
if inside_wikilink_block
|| (inside_link_block && !include_verbatim)
|| (inside_code_block && !include_verbatim) {
None
} else {
Some(extract_raw_uri_from_plaintext(&txt))
Expand All @@ -137,6 +144,7 @@ pub(crate) fn extract_markdown(
// A detected link block.
Event::End(TagEnd::Link) => {
inside_link_block = false;
inside_wikilink_block = false;
None
}

Expand Down Expand Up @@ -439,4 +447,61 @@ $$
let uris = extract_markdown(markdown, true, true);
assert!(uris.is_empty());
}

#[test]
fn test_link_text_not_checked() {
    // With verbatim extraction switched off (second argument `false`), a URL
    // that only appears in the visible text of an inline link must not be
    // reported on its own; the link destination is the sole result.
    let input =
        r"[https://lycheerepublic.gov/notexist (archive.org link)](https://example.com)";
    let found = extract_markdown(input, false, false);

    // The destination is reported as if it came from an `<a href="...">` tag.
    let destination_only = vec![RawUri {
        text: String::from("https://example.com"),
        element: Some(String::from("a")),
        attribute: Some(String::from("href")),
    }];

    assert_eq!(found, destination_only);
    assert_eq!(
        found.len(),
        1,
        "Should only find destination URL, not link text"
    );
}

#[test]
fn test_link_text_checked_with_include_verbatim() {
    // With verbatim extraction switched on (second argument `true`), a URL
    // inside the visible link text is reported in addition to the link
    // destination.
    let input =
        r"[https://lycheerepublic.gov/notexist (archive.org link)](https://example.com)";
    let found = extract_markdown(input, true, false);

    // The destination, reported as if it came from an `<a href="...">` tag.
    let destination = RawUri {
        text: String::from("https://example.com"),
        element: Some(String::from("a")),
        attribute: Some(String::from("href")),
    };
    // The URL taken from the link text carries no element/attribute.
    let from_link_text = RawUri {
        text: String::from("https://lycheerepublic.gov/notexist"),
        element: None,
        attribute: None,
    };
    let expected = vec![destination, from_link_text];

    assert_eq!(
        found.len(),
        2,
        "Should find both destination URL and link text"
    );
    // Membership is checked instead of exact equality so that the test does
    // not depend on the order in which the two URIs are emitted.
    for expected_uri in &expected {
        assert!(
            found.contains(expected_uri),
            "Missing expected URI: {expected_uri:?}"
        );
    }
}
}
Loading