From fc8ccf0ed1f0f00d2c1d8bcf1edcaecbec0ac2e7 Mon Sep 17 00:00:00 2001 From: Matthias Date: Tue, 11 Nov 2025 12:11:47 +0100 Subject: [PATCH] Fix extracting links after `<pre>
` Fixes https://github.com/lycheeverse/lychee/issues/1905 The problem was that the EndTag event handler was checking if `last_verbatim == &self.current_element`, but `self.current_element` was never updated during EndTag events - it only got set during OpenStartTag events. This meant when `</pre>` was processed, it couldn't match against the verbatim stack to pop it, leaving the verbatim flag incorrectly active for subsequent content. --- lychee-lib/src/extract/html/html5ever.rs | 36 ++++++++++++++++++++ lychee-lib/src/extract/html/html5gum.rs | 43 ++++++++++++++++++++++-- 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/lychee-lib/src/extract/html/html5ever.rs b/lychee-lib/src/extract/html/html5ever.rs index 6bfb8d40ca..fae25d6161 100644 --- a/lychee-lib/src/extract/html/html5ever.rs +++ b/lychee-lib/src/extract/html/html5ever.rs @@ -583,4 +583,40 @@ mod tests { let uris = extract_html(input, false); assert!(uris.is_empty()); } + + #[test] + fn test_extract_links_after_empty_verbatim_block() { + // Test that links are correctly extracted after empty <pre> blocks
+        let input = r#"
+        
+            
+                See <a href="https://example.com/1">First</a>
+
+            <pre>
+                
+            </pre>
+
+                See <a href="https://example.com/2">Second</a>
+
+ + "#; + + let expected = vec![ + RawUri { + text: "https://example.com/1".to_string(), + element: Some("a".to_string()), + attribute: Some("href".to_string()), + span: span_line(4), + }, + RawUri { + text: "https://example.com/2".to_string(), + element: Some("a".to_string()), + attribute: Some("href".to_string()), + span: span_line(10), + }, + ]; + + let uris = extract_html(input, false); + assert_eq!(uris, expected); + } } diff --git a/lychee-lib/src/extract/html/html5gum.rs b/lychee-lib/src/extract/html/html5gum.rs index ad6f4ee4d3..c7b58c135f 100644 --- a/lychee-lib/src/extract/html/html5gum.rs +++ b/lychee-lib/src/extract/html/html5gum.rs @@ -278,14 +278,15 @@ impl Callback<(), usize> for &mut LinkExtractor { self.verbatim_stack.pop(); } } - CallbackEvent::EndTag { .. } => { + CallbackEvent::EndTag { name } => { + let tag_name = String::from_utf8_lossy(name); // Update the current verbatim element name. // // Keeps track of the last verbatim element name, so that we can // properly handle nested verbatim blocks. - if self.filter_verbatim_here() + if !self.include_verbatim && let Some(last_verbatim) = self.verbatim_stack.last() - && last_verbatim == &self.current_element + && last_verbatim == tag_name.as_ref() { self.verbatim_stack.pop(); } @@ -726,4 +727,40 @@ mod tests { let actual = extract_html_fragments(input); assert_eq!(actual, expected); } + + #[test] + fn test_extract_links_after_empty_verbatim_block() { + // Test that links are correctly extracted after empty <pre> blocks
+        let input = r#"
+        
+            
+                See <a href="https://example.com/1">First</a>
+
+            <pre>
+                
+            </pre>
+
+                See <a href="https://example.com/2">Second</a>
+
+ + "#; + + let expected = vec![ + RawUri { + text: "https://example.com/1".to_string(), + element: Some("a".to_string()), + attribute: Some("href".to_string()), + span: span(4, 30), + }, + RawUri { + text: "https://example.com/2".to_string(), + element: Some("a".to_string()), + attribute: Some("href".to_string()), + span: span(10, 30), + }, + ]; + + let uris = extract_html(input, false); + assert_eq!(uris, expected); + } }