diff --git a/Cargo.lock b/Cargo.lock index 57c716960f..0f8552e94d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1821,9 +1821,9 @@ dependencies = [ [[package]] name = "html5gum" -version = "0.7.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3918b5f36d61861b757261da986b51be562c7a87ac4e531d4158e67e08bff72" +checksum = "ba6fbe46e93059ce8ee19fbefdb0c7699cc7197fcaac048f2c3593f3e5da845f" dependencies = [ "jetscii", ] diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index 61587bc97c..7959ace62e 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -24,7 +24,7 @@ futures = "0.3.31" glob = "0.3.2" headers = "0.4.0" html5ever = "0.31.0" -html5gum = "0.7.0" +html5gum = "0.8.0" http = "1.3.1" hyper = "1.6.0" ignore = "0.4.23" diff --git a/lychee-lib/src/extract/html/html5ever.rs b/lychee-lib/src/extract/html/html5ever.rs index d7fa67bbf1..316f1df5ad 100644 --- a/lychee-lib/src/extract/html/html5ever.rs +++ b/lychee-lib/src/extract/html/html5ever.rs @@ -9,9 +9,33 @@ use html5ever::{ use super::{ super::plaintext::extract_raw_uri_from_plaintext, is_email_link, is_verbatim_elem, srcset, }; -use crate::types::uri::raw::RawUri; +use crate::types::uri::raw::{RawUri, RawUriSpan, SourceSpanProvider, SpanProvider}; + +/// A [`SpanProvider`] which applies a given line offset. +struct LineOffsetSpanProvider<'a> { + /// The number of lines each span will be offset by. + lines_before: usize, + /// The inner [`SpanProvider`] which will be responsible for computing the spans. + inner: &'a SourceSpanProvider<'a>, +} -#[derive(Clone, Default)] +impl SpanProvider for LineOffsetSpanProvider<'_> { + fn span(&self, offset: usize) -> RawUriSpan { + let mut span = self.inner.span(offset); + // if we stay in the same line the column information is wrong, since we didn't know the + // column beforehand and likely did not start at a linebreak. + // This can be improved in the future by using the computed length of lines. + if span.line.get() == 1 { + span.column = None; + } + span.line = span + .line + .saturating_add(self.lines_before.saturating_sub(1)); + span + } +} + +#[derive(Clone)] struct LinkExtractor { links: RefCell>, include_verbatim: bool, @@ -22,7 +46,8 @@ impl TokenSink for LinkExtractor { type Handle = (); #[allow(clippy::match_same_arms)] - fn process_token(&self, token: Token, _line_number: u64) -> TokenSinkResult<()> { + fn process_token(&self, token: Token, line_number: u64) -> TokenSinkResult<()> { + debug_assert_ne!(line_number, 0); match token { Token::CharacterTokens(raw) => { if self.current_verbatim_element_name.borrow().is_some() { @@ -31,122 +56,16 @@ impl TokenSink for LinkExtractor { if self.include_verbatim { self.links .borrow_mut() - .extend(extract_raw_uri_from_plaintext(&raw)); - } - } - Token::TagToken(tag) => { - let Tag { - kind, - name, - self_closing: _self_closing, - attrs, - } = tag; - // Check if this is a verbatim element, which we want to skip. - if !self.include_verbatim && is_verbatim_elem(&name) { - // Check if we're currently inside a verbatim block - let mut curr_verbatim_elem = self.current_verbatim_element_name.borrow_mut(); - - if curr_verbatim_elem.is_some() { - // Inside a verbatim block. Check if the verbatim - // element name matches with the current element name. - if curr_verbatim_elem.as_ref() == Some(&name.to_string()) { - // If so, we're done with the verbatim block, - // -- but only if this is an end tag. 
- if matches!(kind, TagKind::EndTag) { - *curr_verbatim_elem = None; - } - } - } else if matches!(kind, TagKind::StartTag) { - // We're not inside a verbatim block, but we just - // encountered a verbatim element. Remember the name - // of the element. - *curr_verbatim_elem = Some(name.to_string()); - } - } - if self.current_verbatim_element_name.borrow().is_some() { - // We want to skip the content of this element - // as we're inside a verbatim block. - return TokenSinkResult::Continue; - } - - // Check for rel=nofollow. We only extract the first `rel` attribute. - // This is correct as per https://html.spec.whatwg.org/multipage/syntax.html#attributes-0, which states - // "There must never be two or more attributes on the same start tag whose names are an ASCII case-insensitive match for each other." - if let Some(rel) = attrs.iter().find(|attr| &attr.name.local == "rel") { - if rel.value.contains("nofollow") { - return TokenSinkResult::Continue; - } - } - - // Check and exclude `rel=preconnect` and `rel=dns-prefetch`. Unlike `prefetch` and `preload`, - // `preconnect` and `dns-prefetch` only perform DNS lookups and do not necessarily link to a resource - if let Some(rel) = attrs.iter().find(|attr| &attr.name.local == "rel") { - if rel.value.contains("preconnect") || rel.value.contains("dns-prefetch") { - return TokenSinkResult::Continue; - } - } - - // Check and exclude `prefix` attribute. This attribute is used to define a prefix - // for the current element. It is not used to link to a resource. - if let Some(_prefix) = attrs.iter().find(|attr| &attr.name.local == "prefix") { - return TokenSinkResult::Continue; - } - - for attr in &attrs { - let urls = LinkExtractor::extract_urls_from_elem_attr( - &attr.name.local, - &name, - &attr.value, - ); - - let new_urls = match urls { - None => extract_raw_uri_from_plaintext(&attr.value), - Some(urls) => urls - .into_iter() - .filter(|url| { - // Only accept email addresses which - // - occur in `href` attributes - // - start with `mailto:` - // - // Technically, email addresses could - // also occur in plain text, but we don't want to extract those - // because of the high false positive rate. 
- // - // This ignores links like `` - let is_email = is_email_link(url); - let is_mailto = url.starts_with("mailto:"); - let is_phone = url.starts_with("tel:"); - let is_href = attr.name.local.as_ref() == "href"; - - if attrs.iter().any(|attr| { - &attr.name.local == "rel" && attr.value.contains("stylesheet") - }) { - // Skip virtual/framework-specific stylesheet paths that start with /@ or @ - // These are typically resolved by dev servers or build tools rather than being real URLs - // Examples: /@global/style.css, @tailwind/base.css as in - // `` - if url.starts_with("/@") || url.starts_with('@') { - return false; - } - // Skip disabled stylesheets - // Ref: https://developer.mozilla.org/en-US/docs/Web/API/HTMLLinkElement/disabled - if attrs.iter().any(|attr| &attr.name.local == "disabled") { - return false; - } - } - - !is_email || (is_mailto && is_href) || (is_phone && is_href) - }) - .map(|url| RawUri { - text: url.to_string(), - element: Some(name.to_string()), - attribute: Some(attr.name.local.to_string()), - }) - .collect::>(), - }; - self.links.borrow_mut().extend(new_urls); + .extend(extract_raw_uri_from_plaintext( + &raw, + &LineOffsetSpanProvider { + lines_before: line_number.try_into().unwrap(), + inner: &SourceSpanProvider::from_input(&raw), + }, + )); } } + Token::TagToken(tag) => return self.process_tag(tag, line_number), Token::ParseError(_err) => { // Silently ignore parse errors } @@ -168,6 +87,134 @@ impl LinkExtractor { } } + fn process_tag( + &self, + Tag { + kind, + name, + self_closing: _, + attrs, + }: Tag, + line_number: u64, + ) -> TokenSinkResult<()> { + // Check if this is a verbatim element, which we want to skip. + if !self.include_verbatim && is_verbatim_elem(&name) { + // Check if we're currently inside a verbatim block + let mut curr_verbatim_elem = self.current_verbatim_element_name.borrow_mut(); + + if curr_verbatim_elem.is_some() { + // Inside a verbatim block. Check if the verbatim + // element name matches with the current element name. + if curr_verbatim_elem.as_ref() == Some(&name.to_string()) { + // If so, we're done with the verbatim block, + // -- but only if this is an end tag. + if matches!(kind, TagKind::EndTag) { + *curr_verbatim_elem = None; + } + } + } else if matches!(kind, TagKind::StartTag) { + // We're not inside a verbatim block, but we just + // encountered a verbatim element. Remember the name + // of the element. + *curr_verbatim_elem = Some(name.to_string()); + } + } + if self.current_verbatim_element_name.borrow().is_some() { + // We want to skip the content of this element + // as we're inside a verbatim block. + return TokenSinkResult::Continue; + } + + // Check for rel=nofollow. We only extract the first `rel` attribute. + // This is correct as per https://html.spec.whatwg.org/multipage/syntax.html#attributes-0, which states + // "There must never be two or more attributes on the same start tag whose names are an ASCII case-insensitive match for each other." + if let Some(rel) = attrs.iter().find(|attr| &attr.name.local == "rel") { + if rel.value.contains("nofollow") { + return TokenSinkResult::Continue; + } + } + + // Check and exclude `rel=preconnect` and `rel=dns-prefetch`. 
Unlike `prefetch` and `preload`, + // `preconnect` and `dns-prefetch` only perform DNS lookups and do not necessarily link to a resource + if let Some(rel) = attrs.iter().find(|attr| &attr.name.local == "rel") { + if rel.value.contains("preconnect") || rel.value.contains("dns-prefetch") { + return TokenSinkResult::Continue; + } + } + + // Check and exclude `prefix` attribute. This attribute is used to define a prefix + // for the current element. It is not used to link to a resource. + if let Some(_prefix) = attrs.iter().find(|attr| &attr.name.local == "prefix") { + return TokenSinkResult::Continue; + } + + for attr in &attrs { + let urls = + LinkExtractor::extract_urls_from_elem_attr(&attr.name.local, &name, &attr.value); + + let new_urls = match urls { + None => extract_raw_uri_from_plaintext( + &attr.value, + &LineOffsetSpanProvider { + lines_before: line_number.try_into().unwrap(), + inner: &SourceSpanProvider::from_input(&attr.value), + }, + ), + Some(urls) => urls + .into_iter() + .filter(|url| { + // Only accept email addresses which + // - occur in `href` attributes + // - start with `mailto:` + // + // Technically, email addresses could + // also occur in plain text, but we don't want to extract those + // because of the high false positive rate. + // + // This ignores links like `` + let is_email = is_email_link(url); + let is_mailto = url.starts_with("mailto:"); + let is_phone = url.starts_with("tel:"); + let is_href = attr.name.local.as_ref() == "href"; + + if attrs.iter().any(|attr| { + &attr.name.local == "rel" && attr.value.contains("stylesheet") + }) { + // Skip virtual/framework-specific stylesheet paths that start with /@ or @ + // These are typically resolved by dev servers or build tools rather than being real URLs + // Examples: /@global/style.css, @tailwind/base.css as in + // `` + if url.starts_with("/@") || url.starts_with('@') { + return false; + } + // Skip disabled stylesheets + // Ref: https://developer.mozilla.org/en-US/docs/Web/API/HTMLLinkElement/disabled + if attrs.iter().any(|attr| &attr.name.local == "disabled") { + return false; + } + } + + !is_email || (is_mailto && is_href) || (is_phone && is_href) + }) + .map(|url| RawUri { + text: url.to_string(), + element: Some(name.to_string()), + attribute: Some(attr.name.local.to_string()), + span: RawUriSpan { + line: usize::try_from(line_number) + .unwrap() + .try_into() + .expect("checked above that `line_number != 0`"), + column: None, + }, + }) + .collect::>(), + }; + self.links.borrow_mut().extend(new_urls); + } + TokenSinkResult::Continue + } + /// Extract all semantically known links from a given HTML attribute. #[allow(clippy::unnested_or_patterns)] pub(crate) fn extract_urls_from_elem_attr<'a>( @@ -242,12 +289,21 @@ mod tests { "#; + /// Small test helper to create a [`RawUriSpan`] from just the line and leave the column unset. 
+ const fn span(line: usize) -> RawUriSpan { + RawUriSpan { + line: std::num::NonZeroUsize::new(line).unwrap(), + column: None, + } + } + #[test] fn test_skip_verbatim() { let expected = vec![RawUri { text: "https://example.org".to_string(), element: Some("a".to_string()), attribute: Some("href".to_string()), + span: span(4), }]; let uris = extract_html(HTML_INPUT, false); @@ -261,26 +317,31 @@ mod tests { text: "https://example.com".to_string(), element: None, attribute: None, + span: span(4), }, RawUri { text: "https://example.org".to_string(), element: Some("a".to_string()), attribute: Some("href".to_string()), + span: span(4), }, RawUri { text: "https://foo.com".to_string(), element: None, attribute: None, + span: span(7), }, RawUri { text: "http://bar.com/some/path".to_string(), element: None, attribute: None, + span: span(7), }, RawUri { text: "https://baz.org".to_string(), element: Some("a".to_string()), attribute: Some("href".to_string()), + span: span(9), }, ]; @@ -303,6 +364,7 @@ mod tests { text: "https://example.com/".to_string(), element: Some("a".to_string()), attribute: Some("href".to_string()), + span: span(2), }]; let uris = extract_html(HTML_INPUT, false); @@ -320,6 +382,7 @@ mod tests { text: "https://example.org".to_string(), element: Some("a".to_string()), attribute: Some("href".to_string()), + span: span(4), }]; let uris = extract_html(input, false); assert_eq!(uris, expected); @@ -337,6 +400,7 @@ mod tests { text: "https://example.org".to_string(), element: Some("a".to_string()), attribute: Some("href".to_string()), + span: span(5), }]; let uris = extract_html(input, false); assert_eq!(uris, expected); @@ -353,6 +417,7 @@ mod tests { text: "https://example.org".to_string(), element: Some("a".to_string()), attribute: Some("href".to_string()), + span: span(4), }]; let uris = extract_html(input, false); assert_eq!(uris, expected); @@ -375,6 +440,7 @@ mod tests { text: "mailto:foo@bar.com".to_string(), element: Some("a".to_string()), attribute: Some("href".to_string()), + span: span(8), }]; let uris = extract_html(input, false); assert_eq!(uris, expected); @@ -397,6 +463,7 @@ mod tests { text: "tel:1234567890".to_string(), element: Some("a".to_string()), attribute: Some("href".to_string()), + span: span(8), }]; let uris = extract_html(input, false); assert_eq!(uris, expected); @@ -477,6 +544,7 @@ mod tests { text: "https://example.com".to_string(), element: Some("a".to_string()), attribute: Some("href".to_string()), + span: span(2), }]; let uris = extract_html(input, false); diff --git a/lychee-lib/src/extract/html/html5gum.rs b/lychee-lib/src/extract/html/html5gum.rs index c61a9aa321..2bee5345f3 100644 --- a/lychee-lib/src/extract/html/html5gum.rs +++ b/lychee-lib/src/extract/html/html5gum.rs @@ -1,17 +1,14 @@ -use html5gum::{Emitter, Error, State, Tokenizer}; +use html5gum::{ + Spanned, Tokenizer, + emitters::callback::{Callback, CallbackEmitter, CallbackEvent}, +}; use std::collections::{HashMap, HashSet}; use super::{is_email_link, is_verbatim_elem, srcset}; -use crate::{extract::plaintext::extract_raw_uri_from_plaintext, types::uri::raw::RawUri}; - -#[derive(Clone, Default, Debug)] -struct Element { - /// Current element name being processed. - /// This is called a tag in html5gum. - name: String, - /// Whether the current element is a closing tag. - is_closing: bool, -} +use crate::{ + extract::plaintext::extract_raw_uri_from_plaintext, + types::uri::raw::{OffsetSpanProvider, RawUri, SourceSpanProvider, SpanProvider}, +}; /// Extract links from HTML documents. 
/// @@ -25,38 +22,48 @@ struct Element { /// /// The `links` vector contains all links extracted from the HTML document and /// the `fragments` set contains all fragments extracted from the HTML document. -#[derive(Clone, Default, Debug)] -struct LinkExtractor { +#[derive(Clone, Debug)] +struct LinkExtractor { + /// The [`SpanProvider`] which will be used to compute spans for URIs. + /// + /// This is generic, since e.g. the markdown parser has already started, so we have to compute + /// the span location in relation to the offset in the outer document. + span_provider: S, /// Links extracted from the HTML document. links: Vec, /// Fragments extracted from the HTML document. fragments: HashSet, /// Whether to include verbatim elements in the output. include_verbatim: bool, - /// Current element being processed. - current_element: Element, + /// Current element name being processed. + /// This is called a tag in html5gum. + current_element: String, /// Current attributes being processed. /// This is a list of key-value pairs (in order of appearance), where the key is the attribute name /// and the value is the attribute value. - current_attributes: HashMap, + current_attributes: HashMap>, /// Current attribute name being processed. current_attribute_name: String, - /// A bunch of plain characters currently being processed. - current_raw_string: String, /// Element name of the current verbatim block. /// Used to keep track of nested verbatim blocks. verbatim_stack: Vec, } -impl LinkExtractor { +impl LinkExtractor { /// Create a new `LinkExtractor`. /// /// Set `include_verbatim` to `true` if you want to include verbatim /// elements in the output. - fn new(include_verbatim: bool) -> Self { + fn new(span_provider: S, include_verbatim: bool) -> Self { Self { + span_provider, include_verbatim, - ..Default::default() + links: Vec::default(), + fragments: HashSet::default(), + current_element: String::default(), + current_attributes: HashMap::default(), + current_attribute_name: String::default(), + verbatim_stack: Vec::default(), } } @@ -69,17 +76,19 @@ impl LinkExtractor { // Process 'srcset' attribute first if let Some(srcset) = self.current_attributes.get("srcset") { + let span = srcset.span; urls.extend(srcset::parse(srcset).into_iter().map(|url| RawUri { text: url.to_string(), - element: Some(self.current_element.name.clone()), + element: Some(self.current_element.clone()), attribute: Some("srcset".to_string()), + span: self.span_provider.span(span.start), })); } // Process other attributes for (attr_name, attr_value) in &self.current_attributes { #[allow(clippy::unnested_or_patterns)] - match (self.current_element.name.as_str(), attr_name.as_str()) { + match (self.current_element.as_str(), attr_name.as_str()) { // Common element/attribute combinations for links (_, "href" | "src" | "cite" | "usemap") | // Less common (but still valid!) combinations @@ -98,8 +107,9 @@ impl LinkExtractor { ("video", "poster") => { urls.push(RawUri { text: attr_value.to_string(), - element: Some(self.current_element.name.clone()), + element: Some(self.current_element.clone()), attribute: Some(attr_name.to_string()), + span: self.span_provider.span(attr_value.span.start), }); } _ => {} @@ -109,37 +119,9 @@ impl LinkExtractor { urls } - /// Extract links from the current string and add them to the links vector. 
- fn flush_current_characters(&mut self) { - if !self.include_verbatim - && (is_verbatim_elem(&self.current_element.name) || !self.verbatim_stack.is_empty()) - { - self.update_verbatim_element(); - // Early return since we don't want to extract links from verbatim - // blocks according to the configuration. - self.current_raw_string.clear(); - return; - } - - self.links - .extend(extract_raw_uri_from_plaintext(&self.current_raw_string)); - self.current_raw_string.clear(); - } - - /// Update the current verbatim element name. - /// - /// Keeps track of the last verbatim element name, so that we can - /// properly handle nested verbatim blocks. - fn update_verbatim_element(&mut self) { - if self.current_element.is_closing { - if let Some(last_verbatim) = self.verbatim_stack.last() { - if last_verbatim == &self.current_element.name { - self.verbatim_stack.pop(); - } - } - } else if !self.include_verbatim && is_verbatim_elem(&self.current_element.name) { - self.verbatim_stack.push(self.current_element.name.clone()); - } + fn filter_verbatim_here(&self) -> bool { + !self.include_verbatim + && (is_verbatim_elem(&self.current_element) || !self.verbatim_stack.is_empty()) } /// Flush the current element and attribute values to the links vector. @@ -160,11 +142,7 @@ impl LinkExtractor { /// /// The current attribute name and value are cleared after processing. fn flush_links(&mut self) { - self.update_verbatim_element(); - - if !self.include_verbatim - && (!self.verbatim_stack.is_empty() || is_verbatim_elem(&self.current_element.name)) - { + if self.filter_verbatim_here() { self.current_attributes.clear(); return; } @@ -239,119 +217,97 @@ impl LinkExtractor { } } -impl Emitter for &mut LinkExtractor { - type Token = (); - - fn set_last_start_tag(&mut self, last_start_tag: Option<&[u8]>) { - self.current_element.name = - String::from_utf8_lossy(last_start_tag.unwrap_or_default()).into_owned(); - } - - fn emit_eof(&mut self) { - self.flush_current_characters(); - } - - fn emit_error(&mut self, _: Error) {} - - fn should_emit_errors(&mut self) -> bool { - false - } - - fn pop_token(&mut self) -> Option<()> { - None - } - - /// Emit a bunch of plain characters as character tokens. - fn emit_string(&mut self, c: &[u8]) { - self.current_raw_string - .push_str(&String::from_utf8_lossy(c)); - } - - fn init_start_tag(&mut self) { - self.flush_current_characters(); - self.current_element = Element::default(); - } - - fn init_end_tag(&mut self) { - self.flush_current_characters(); - self.current_element = Element { - name: String::new(), - is_closing: true, - }; - } - - fn init_comment(&mut self) { - self.flush_current_characters(); - } - - fn emit_current_tag(&mut self) -> Option { - self.flush_links(); +impl Callback<(), usize> for &mut LinkExtractor { + fn handle_event( + &mut self, + event: CallbackEvent<'_>, + span: html5gum::Span, + ) -> Option<()> { + match event { + CallbackEvent::OpenStartTag { name } => { + self.current_element = String::from_utf8_lossy(name).into_owned(); + + // Update the current verbatim element name. + // + // Keeps track of the last verbatim element name, so that we can + // properly handle nested verbatim blocks. 
+ if self.filter_verbatim_here() && is_verbatim_elem(&self.current_element) { + self.verbatim_stack.push(self.current_element.clone()); + } + } + CallbackEvent::AttributeName { name } => { + self.current_attribute_name = String::from_utf8_lossy(name).into_owned(); + } + CallbackEvent::AttributeValue { value } => { + let value = String::from_utf8_lossy(value); + self.current_attributes + .entry(self.current_attribute_name.clone()) + .and_modify(|v| v.push_str(&value)) + .or_insert_with(|| Spanned { + value: value.into_owned(), + span, + }); + } + CallbackEvent::CloseStartTag { self_closing } => { + self.flush_links(); - if self.current_element.is_closing { - None - } else { - html5gum::naive_next_state(self.current_element.name.as_bytes()) + // Update the current verbatim element name. + // + // Keeps track of the last verbatim element name, so that we can + // properly handle nested verbatim blocks. + if self_closing && self.filter_verbatim_here() { + if let Some(last_verbatim) = self.verbatim_stack.last() { + if last_verbatim == &self.current_element { + self.verbatim_stack.pop(); + } + } + } + } + CallbackEvent::EndTag { .. } => { + // Update the current verbatim element name. + // + // Keeps track of the last verbatim element name, so that we can + // properly handle nested verbatim blocks. + if self.filter_verbatim_here() { + if let Some(last_verbatim) = self.verbatim_stack.last() { + if last_verbatim == &self.current_element { + self.verbatim_stack.pop(); + } + } + } + } + CallbackEvent::String { value } => { + if !self.filter_verbatim_here() { + // Extract links from the current string and add them to the links vector. + self.links.extend(extract_raw_uri_from_plaintext( + &String::from_utf8_lossy(value), + &OffsetSpanProvider { + offset: span.start, + inner: &self.span_provider, + }, + )); + } + } + CallbackEvent::Comment { .. } + | CallbackEvent::Doctype { .. } + | CallbackEvent::Error(_) => {} } + None } - - fn emit_current_doctype(&mut self) {} - - fn set_self_closing(&mut self) { - self.current_element.is_closing = true; - } - - fn set_force_quirks(&mut self) {} - - fn push_tag_name(&mut self, s: &[u8]) { - self.current_element - .name - .push_str(&String::from_utf8_lossy(s)); - } - - fn push_comment(&mut self, _: &[u8]) {} - - fn push_doctype_name(&mut self, _: &[u8]) {} - - fn init_doctype(&mut self) { - self.flush_current_characters(); - } - - fn init_attribute(&mut self) { - self.current_attribute_name.clear(); - } - - fn push_attribute_name(&mut self, s: &[u8]) { - self.current_attribute_name - .push_str(&String::from_utf8_lossy(s)); - } - - fn push_attribute_value(&mut self, s: &[u8]) { - let value = String::from_utf8_lossy(s); - self.current_attributes - .entry(self.current_attribute_name.clone()) - .and_modify(|v| v.push_str(&value)) - .or_insert_with(|| value.into_owned()); - } - - fn set_doctype_public_identifier(&mut self, _: &[u8]) {} - - fn set_doctype_system_identifier(&mut self, _: &[u8]) {} - - fn push_doctype_public_identifier(&mut self, _: &[u8]) {} - - fn push_doctype_system_identifier(&mut self, _: &[u8]) {} - - fn current_is_appropriate_end_tag_token(&mut self) -> bool { - self.current_element.is_closing && !self.current_element.name.is_empty() - } - - fn emit_current_comment(&mut self) {} } /// Extract unparsed URL strings from an HTML string. 
pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec { - let mut extractor = LinkExtractor::new(include_verbatim); - let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut extractor); + extract_html_with_span(buf, include_verbatim, SourceSpanProvider::from_input(buf)) +} + +pub(crate) fn extract_html_with_span( + buf: &str, + include_verbatim: bool, + span_provider: S, +) -> Vec { + let mut extractor = LinkExtractor::new(span_provider, include_verbatim); + let mut tokenizer = Tokenizer::new_with_emitter(buf, CallbackEmitter::new(&mut extractor)); assert!(tokenizer.next().is_none()); extractor .links @@ -362,14 +318,17 @@ pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec { /// Extract fragments from id attributes within a HTML string. pub(crate) fn extract_html_fragments(buf: &str) -> HashSet { - let mut extractor = LinkExtractor::new(true); - let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut extractor); + let span_provider = SourceSpanProvider::from_input(buf); + let mut extractor = LinkExtractor::new(span_provider, true); + let mut tokenizer = Tokenizer::new_with_emitter(buf, CallbackEmitter::new(&mut extractor)); assert!(tokenizer.next().is_none()); extractor.fragments } #[cfg(test)] mod tests { + use crate::types::uri::raw::span; + use super::*; const HTML_INPUT: &str = r#" @@ -403,6 +362,7 @@ mod tests { text: "https://example.org".to_string(), element: Some("a".to_string()), attribute: Some("href".to_string()), + span: span(4, 121), }]; let uris = extract_html(HTML_INPUT, false); @@ -416,26 +376,31 @@ mod tests { text: "https://example.com".to_string(), element: None, attribute: None, + span: span(4, 72), }, RawUri { text: "https://example.org".to_string(), element: Some("a".to_string()), attribute: Some("href".to_string()), + span: span(4, 121), }, RawUri { text: "https://foo.com".to_string(), element: None, attribute: None, + span: span(7, 9), }, RawUri { text: "http://bar.com/some/path".to_string(), element: None, attribute: None, + span: span(7, 29), }, RawUri { text: "https://baz.org".to_string(), element: Some("a".to_string()), attribute: Some("href".to_string()), + span: span(9, 18), }, ]; @@ -458,6 +423,7 @@ mod tests { text: "https://example.com/".to_string(), element: Some("a".to_string()), attribute: Some("href".to_string()), + span: span(2, 18), }]; let uris = extract_html(HTML_INPUT, false); @@ -489,6 +455,7 @@ mod tests { text: "https://example.org".to_string(), element: Some("a".to_string()), attribute: Some("href".to_string()), + span: span(4, 18), }]; let uris = extract_html(input, false); assert_eq!(uris, expected); @@ -515,6 +482,7 @@ mod tests { text: "https://example.org".to_string(), element: Some("a".to_string()), attribute: Some("href".to_string()), + span: span(5, 18), }]; let uris = extract_html(input, false); assert_eq!(uris, expected); @@ -531,6 +499,7 @@ mod tests { text: "https://example.org".to_string(), element: Some("a".to_string()), attribute: Some("href".to_string()), + span: span(4, 18), }]; let uris = extract_html(input, false); assert_eq!(uris, expected); @@ -553,6 +522,7 @@ mod tests { text: "tel:1234567890".to_string(), element: Some("a".to_string()), attribute: Some("href".to_string()), + span: span(8, 22), }]; let uris = extract_html(input, false); assert_eq!(uris, expected); @@ -575,6 +545,7 @@ mod tests { text: "mailto:foo@bar.com".to_string(), element: Some("a".to_string()), attribute: Some("href".to_string()), + span: span(8, 22), }]; let uris = extract_html(input, false); assert_eq!(uris, 
expected); @@ -614,16 +585,19 @@ mod tests { text: "/cdn-cgi/image/format=webp,width=640/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg".to_string(), element: Some("img".to_string()), attribute: Some("srcset".to_string()), + span: span(2, 26), }, RawUri { text: "/cdn-cgi/image/format=webp,width=750/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg".to_string(), element: Some("img".to_string()), attribute: Some("srcset".to_string()), + span: span(2, 26), }, RawUri { text: "/cdn-cgi/image/format=webp,width=3840/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg".to_string(), element: Some("img".to_string()), attribute: Some("src".to_string()), + span: span(2, 231), } ]; @@ -670,6 +644,7 @@ mod tests { text: "https://example.com".to_string(), element: Some("a".to_string()), attribute: Some("href".to_string()), + span: span(2, 22), }]; let uris = extract_html(input, false); diff --git a/lychee-lib/src/extract/markdown.rs b/lychee-lib/src/extract/markdown.rs index 41d6b515ff..6aeeef505f 100644 --- a/lychee-lib/src/extract/markdown.rs +++ b/lychee-lib/src/extract/markdown.rs @@ -1,11 +1,14 @@ //! Extract links and fragments from markdown documents use std::collections::{HashMap, HashSet}; -use pulldown_cmark::{CowStr, Event, LinkType, Options, Parser, Tag, TagEnd, TextMergeStream}; +use pulldown_cmark::{CowStr, Event, LinkType, Options, Parser, Tag, TagEnd, TextMergeWithOffset}; -use crate::{extract::plaintext::extract_raw_uri_from_plaintext, types::uri::raw::RawUri}; +use crate::{ + extract::{html::html5gum::extract_html_with_span, plaintext::extract_raw_uri_from_plaintext}, + types::uri::raw::{OffsetSpanProvider, RawUri, SourceSpanProvider, SpanProvider as _}, +}; -use super::html::html5gum::{extract_html, extract_html_fragments}; +use super::html::html5gum::extract_html_fragments; /// Returns the default markdown extensions used by lychee. /// Sadly, `|` is not const for `Options` so we can't use a const global. @@ -14,15 +17,18 @@ fn md_extensions() -> Options { } /// Extract unparsed URL strings from a Markdown string. +#[allow(clippy::too_many_lines)] pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec { // In some cases it is undesirable to extract links from within code blocks, // which is why we keep track of entries and exits while traversing the input. let mut inside_code_block = false; let mut inside_link_block = false; - let parser = TextMergeStream::new(Parser::new_ext(input, md_extensions())); + let span_provider = SourceSpanProvider::from_input(input); + let parser = + TextMergeWithOffset::new(Parser::new_ext(input, md_extensions()).into_offset_iter()); parser - .filter_map(|event| match event { + .filter_map(|(event, span)| match event { // A link. Event::Start(Tag::Link { link_type, @@ -43,6 +49,9 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec Vec` LinkType::Autolink | // Email address in autolink like `` - LinkType::Email => - Some(extract_raw_uri_from_plaintext(&dest_url)), + LinkType::Email => { + let offset = match link_type { + // We don't know how the link starts, so don't offset the span. + LinkType::Reference | LinkType::CollapsedUnknown | LinkType::ShortcutUnknown => 0, + // These start all with `[` or `<`, so offset the span by `1`. 
+ LinkType::ReferenceUnknown | LinkType::Collapsed | LinkType::Shortcut | LinkType::Autolink | LinkType::Email => 1, + _ => { + debug_assert!(false, "unreachable"); + 0 + } + }; + Some(extract_raw_uri_from_plaintext(&dest_url, &OffsetSpanProvider { offset: span.start + offset, inner: &span_provider, })) + } // Wiki URL (`[[http://example.com]]`) LinkType::WikiLink { has_pothole: _ } => { inside_link_block = true; - //Ignore gitlab toc notation: https://docs.gitlab.com/user/markdown/#table-of-contents + // Ignore gitlab toc notation: https://docs.gitlab.com/user/markdown/#table-of-contents if ["_TOC_".to_string(), "TOC".to_string()].contains(&dest_url.to_string()) { return None; } @@ -73,6 +93,8 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec Vec Vec Vec { // This won't exclude verbatim links right now, because HTML gets passed in chunks // by pulldown_cmark. So excluding `
` and `<code>` is not handled right now.
-                Some(extract_html(&html, include_verbatim))
+                Some(extract_html_with_span(
+                    &html,
+                    include_verbatim,
+                    OffsetSpanProvider { offset: span.start, inner: &span_provider }
+                ))
             }
 
             // An inline code node.
             Event::Code(code) => {
                 if include_verbatim {
-                    Some(extract_raw_uri_from_plaintext(&code))
+                    // Inline code starts with '`', so offset the span by `1`.
+                    Some(extract_raw_uri_from_plaintext(
+                        &code,
+                        &OffsetSpanProvider { offset: span.start + 1, inner: &span_provider }
+                    ))
                 } else {
                     None
                 }
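
As an aside for reviewers (not part of the patch): the `+ 1` above relies on pulldown_cmark reporting the inline-code span including the surrounding backticks, while the emitted `Code` text excludes them. A minimal sketch of that assumption, with an illustrative input:

    // Illustrative only: why the inline-code span is shifted by one byte before
    // the text is handed to the plaintext extractor.
    fn inline_code_offset_sketch() {
        use pulldown_cmark::{Event, Options, Parser};

        let md = "see `https://bar.org` here";
        for (event, span) in Parser::new_ext(md, Options::empty()).into_offset_iter() {
            if let Event::Code(code) = event {
                // The reported span covers the backticks ...
                assert_eq!(&md[span.clone()], "`https://bar.org`");
                // ... while the code text starts one byte later.
                assert_eq!(&md[span.start + 1..span.end - 1], &*code);
            }
        }
    }
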
@@ -228,6 +262,8 @@ impl HeadingIdGenerator {
 
 #[cfg(test)]
 mod tests {
+    use crate::types::uri::raw::span;
+
     use super::*;
 
     const MD_INPUT: &str = r#"
@@ -272,11 +308,13 @@ or inline like `https://bar.org` for instance.
                 text: "https://foo.com".to_string(),
                 element: Some("a".to_string()),
                 attribute: Some("href".to_string()),
+                span: span(4, 19),
             },
             RawUri {
                 text: "http://example.com".to_string(),
                 element: Some("a".to_string()),
                 attribute: Some("href".to_string()),
+                span: span(18, 1),
             },
         ];
 
@@ -291,21 +329,25 @@ or inline like `https://bar.org` for instance.
                 text: "https://foo.com".to_string(),
                 element: Some("a".to_string()),
                 attribute: Some("href".to_string()),
+                span: span(4, 19),
             },
             RawUri {
                 text: "https://bar.com/123".to_string(),
                 element: None,
                 attribute: None,
+                span: span(11, 1),
             },
             RawUri {
                 text: "https://bar.org".to_string(),
                 element: None,
                 attribute: None,
+                span: span(14, 17),
             },
             RawUri {
                 text: "http://example.com".to_string(),
                 element: Some("a".to_string()),
                 attribute: Some("href".to_string()),
+                span: span(18, 1),
             },
         ];
 
@@ -377,6 +419,7 @@ $$
             text: "https://example.com/_/foo".to_string(),
             element: None,
             attribute: None,
+            span: span(1, 1),
         }];
         let uris = extract_markdown(markdown, true);
         assert_eq!(uris, expected);
@@ -389,6 +432,7 @@ $$
             text: "https://example.com/_".to_string(),
             element: None,
             attribute: None,
+            span: span(1, 1),
         }];
         let uris = extract_markdown(markdown, true);
         assert_eq!(uris, expected);
@@ -401,6 +445,7 @@ $$
             text: "https://example.com/destination".to_string(),
             element: Some("a".to_string()),
             attribute: Some("href".to_string()),
+            span: span(1, 3),
         }];
         let uris = extract_markdown(markdown, true);
         assert_eq!(uris, expected);
@@ -414,11 +459,13 @@ $$
                 text: "https://example.com/destination".to_string(),
                 element: Some("a".to_string()),
                 attribute: Some("href".to_string()),
+                span: span(1, 3),
             },
             RawUri {
                 text: "https://example.com/source".to_string(),
                 element: Some("a".to_string()),
                 attribute: Some("href".to_string()),
+                span: span(1, 38),
             },
         ];
         let uris = extract_markdown(markdown, true);
diff --git a/lychee-lib/src/extract/mod.rs b/lychee-lib/src/extract/mod.rs
index 91b48078d8..9c6aeec544 100644
--- a/lychee-lib/src/extract/mod.rs
+++ b/lychee-lib/src/extract/mod.rs
@@ -1,4 +1,7 @@
-use crate::types::{FileType, InputContent, uri::raw::RawUri};
+use crate::types::{
+    FileType, InputContent,
+    uri::raw::{RawUri, SourceSpanProvider},
+};
 
 pub mod html;
 pub mod markdown;
@@ -50,7 +53,10 @@ impl Extractor {
                     html::html5gum::extract_html(&input_content.content, self.include_verbatim)
                 }
             }
-            FileType::Plaintext => extract_raw_uri_from_plaintext(&input_content.content),
+            FileType::Plaintext => extract_raw_uri_from_plaintext(
+                &input_content.content,
+                &SourceSpanProvider::from_input(&input_content.content),
+            ),
         }
     }
 }
diff --git a/lychee-lib/src/extract/plaintext.rs b/lychee-lib/src/extract/plaintext.rs
index 95ff2e6927..fb998415a6 100644
--- a/lychee-lib/src/extract/plaintext.rs
+++ b/lychee-lib/src/extract/plaintext.rs
@@ -1,25 +1,42 @@
-use crate::{types::uri::raw::RawUri, utils::url};
+use crate::{
+    types::uri::raw::{RawUri, SpanProvider},
+    utils::url,
+};
 
 /// Extract unparsed URL strings from plaintext
-pub(crate) fn extract_raw_uri_from_plaintext(input: &str) -> Vec<RawUri> {
+pub(crate) fn extract_raw_uri_from_plaintext(
+    input: &str,
+    span_provider: &impl SpanProvider,
+) -> Vec<RawUri> {
     url::find_links(input)
-        .map(|uri| RawUri::from(uri.as_str()))
+        .map(|uri| RawUri {
+            text: uri.as_str().to_owned(),
+            element: None,
+            attribute: None,
+            span: span_provider.span(uri.start()),
+        })
         .collect()
 }
 
 #[cfg(test)]
 mod tests {
+    use crate::types::uri::raw::{SourceSpanProvider, span};
+
     use super::*;
 
+    fn extract(input: &str) -> Vec<RawUri> {
+        extract_raw_uri_from_plaintext(input, &SourceSpanProvider::from_input(input))
+    }
+
     #[test]
     fn test_extract_local_links() {
         let input = "http://127.0.0.1/ and http://127.0.0.1:8888/ are local links.";
-        let links: Vec<RawUri> = extract_raw_uri_from_plaintext(input);
+        let links: Vec<RawUri> = extract(input);
         assert_eq!(
             links,
             [
-                RawUri::from("http://127.0.0.1/"),
-                RawUri::from("http://127.0.0.1:8888/")
+                RawUri::from(("http://127.0.0.1/", span(1, 1))),
+                RawUri::from(("http://127.0.0.1:8888/", span(1, 23))),
             ]
         );
     }
@@ -27,9 +44,9 @@ mod tests {
     #[test]
     fn test_extract_link_at_end_of_line() {
         let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
-        let uri = RawUri::from(input.trim_end());
+        let uri = RawUri::from((input.trim_end(), span(1, 1)));
 
-        let uris: Vec<RawUri> = extract_raw_uri_from_plaintext(input);
+        let uris: Vec<RawUri> = extract(input);
         assert_eq!(vec![uri], uris);
     }
 }
diff --git a/lychee-lib/src/types/uri/raw.rs b/lychee-lib/src/types/uri/raw.rs
index 3ad51f2cf8..9c1aa37b0e 100644
--- a/lychee-lib/src/types/uri/raw.rs
+++ b/lychee-lib/src/types/uri/raw.rs
@@ -1,4 +1,4 @@
-use std::fmt::Display;
+use std::{fmt::Display, num::NonZeroUsize};
 
 /// A raw URI that got extracted from a document with a fuzzy parser.
 /// Note that this can still be invalid according to stricter URI standards
@@ -17,6 +17,8 @@ pub struct RawUri {
     /// that will be checked e.g. by trying to filter out links that were found
     /// in unwanted attributes like `srcset` or `manifest`.
     pub attribute: Option<String>,
+    /// The position of the URI in the document.
+    pub span: RawUriSpan,
 }
 
 impl Display for RawUri {
@@ -25,12 +27,124 @@ impl Display for RawUri {
     }
 }
 
-impl From<&str> for RawUri {
-    fn from(text: &str) -> Self {
+#[cfg(test)]
+impl From<(&str, RawUriSpan)> for RawUri {
+    fn from((text, span): (&str, RawUriSpan)) -> Self {
         RawUri {
             text: text.to_string(),
             element: None,
             attribute: None,
+            span,
         }
     }
 }
+
+/// A span of a [`RawUri`] in the document.
+///
+/// The span can be used to give more precise error messages.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct RawUriSpan {
+    /// The line of the URI.
+    ///
+    /// The line is 1-based.
+    pub line: NonZeroUsize,
+    /// The column of the URI if computable.
+    ///
+    /// The column is 1-based.
+    /// This is `None` if the column can't be computed exactly,
+    /// e.g. when it comes from the `html5ever` parser.
+    pub column: Option<NonZeroUsize>,
+}
+
+/// Test helper to create [`RawUriSpan`]s easily.
+#[cfg(test)]
+pub const fn span(line: usize, column: usize) -> RawUriSpan {
+    RawUriSpan {
+        line: NonZeroUsize::new(line).unwrap(),
+        column: Some(NonZeroUsize::new(column).unwrap()),
+    }
+}
+
+/// A trait for calculating a [`RawUriSpan`] at a given byte offset in the document.
+///
+/// If you have a document and want spans with absolute positions, use [`SourceSpanProvider`].
+/// If you start inside a document at a given offset, use [`OffsetSpanProvider`].
+pub trait SpanProvider {
+    /// Compute the [`RawUriSpan`] at a given byte offset in the document.
+    fn span(&self, offset: usize) -> RawUriSpan;
+}
+
+/// A [`SpanProvider`] which calculates spans depending on the input lines.
+///
+/// Precomputes line start offsets so that constructing [`RawUriSpan`]s is faster.
+/// If you start inside a document at a given offset, consider using [`OffsetSpanProvider`].
+#[derive(Clone, Debug)]
+pub struct SourceSpanProvider<'a> {
+    /// The computed map from line number to offset in the document.
+    line_starts: Vec<usize>,
+    /// The input document.
+    ///
+    /// This is used to compute column information, since we can't rely on each character being a
+    /// single byte long.
+    input: &'a str,
+}
+
+impl<'a> SourceSpanProvider<'a> {
+    /// Create a [`SpanProvider`] from the given document.
+    ///
+    /// If the input is part of a larger document, consider using [`OffsetSpanProvider`] instead.
+    ///
+    /// This function isn't just a simple constructor: it scans the whole input for line breaks,
+    /// so only call it when the resulting provider will actually be used.
+    pub fn from_input(input: &'a str) -> Self {
+        // FIXME: Consider making this lazy?
+        let line_starts: Vec<_> = core::iter::once(0)
+            .chain(input.match_indices('\n').map(|(i, _)| i + 1))
+            .collect();
+        Self { line_starts, input }
+    }
+}
+
+impl SpanProvider for SourceSpanProvider<'_> {
+    fn span(&self, offset: usize) -> RawUriSpan {
+        const ONE: NonZeroUsize = NonZeroUsize::MIN;
+        let line = match self.line_starts.binary_search(&offset) {
+            Ok(i) => i,
+            Err(i) => i - 1,
+        };
+        // The index comes from the binary_search above (at most `line_starts.len()`) and is
+        // decremented in the `Err` case, so it is always in bounds and this shouldn't panic.
+        let line_offset = self.line_starts[line];
+        let column = self
+            .input
+            .get(line_offset..offset)
+            .or_else(|| self.input.get(line_offset..))
+            // columns are 1-based
+            .map(|v| ONE.saturating_add(v.chars().count()));
+
+        RawUriSpan {
+            // lines are 1-based
+            line: ONE.saturating_add(line),
+            column,
+        }
+    }
+}
+
+/// A [`SpanProvider`] which starts at a given offset in the document.
+///
+/// All given offsets are shifted by the stored amount before the resulting
+/// [`RawUriSpan`] is computed with the inner [`SpanProvider`].
+#[derive(Clone, Debug)]
+pub struct OffsetSpanProvider<'a, T: SpanProvider = SourceSpanProvider<'a>> {
+    /// The byte offset in the document by which all given offsets are shifted before the resulting
+    /// [`RawUriSpan`] is computed with the inner [`SpanProvider`].
+    pub offset: usize,
+    /// The inner [`SpanProvider`] which will be used to determine the spans.
+    pub inner: &'a T,
+}
+
+impl<T: SpanProvider> SpanProvider for OffsetSpanProvider<'_, T> {
+    fn span(&self, offset: usize) -> RawUriSpan {
+        self.inner.span(self.offset + offset)
+    }
+}
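
For reviewers, a small usage sketch of the two providers introduced above, written as a hypothetical test in raw.rs (not part of this diff; the input and offsets are illustrative):

    #[test]
    fn offset_provider_delegates_to_source_provider() {
        use std::num::NonZeroUsize;

        let input = "first line\nsecond https://example.com line";
        let source = SourceSpanProvider::from_input(input);

        // Byte offset 18 is the `h` of `https://…`: line 2, column 8 (both 1-based).
        let span = source.span(18);
        assert_eq!(span.line, NonZeroUsize::new(2).unwrap());
        assert_eq!(span.column, NonZeroUsize::new(8));

        // An OffsetSpanProvider shifts offsets before delegating, e.g. when a nested
        // parser only ever saw the substring of `input` that starts at byte 11.
        let offset = OffsetSpanProvider { offset: 11, inner: &source };
        assert_eq!(offset.span(7), span);
    }
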
diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs
index 6b57b9ee8c..cdf719f9cd 100644
--- a/lychee-lib/src/utils/request.rs
+++ b/lychee-lib/src/utils/request.rs
@@ -206,8 +206,24 @@ fn prepend_root_dir_if_absolute_local_link(text: &str, root_dir: Option<&PathBuf
 
 #[cfg(test)]
 mod tests {
+    use std::num::NonZeroUsize;
+
+    use crate::types::uri::raw::RawUriSpan;
+
     use super::*;
 
+    fn raw_uri(text: &'static str) -> RawUri {
+        RawUri {
+            text: text.to_string(),
+            element: None,
+            attribute: None,
+            span: RawUriSpan {
+                line: NonZeroUsize::MAX,
+                column: None,
+            },
+        }
+    }
+
     #[test]
     fn test_is_anchor() {
         assert!(is_anchor("#anchor"));
@@ -226,7 +242,7 @@ mod tests {
         let base = Base::try_from("https://example.com/path/page.html").unwrap();
         let source = InputSource::String(String::new());
 
-        let uris = vec![RawUri::from("relative.html")];
+        let uris = vec![raw_uri("relative.html")];
         let requests = create(uris, &source, None, Some(&base), None);
 
         assert_eq!(requests.len(), 1);
@@ -242,7 +258,7 @@ mod tests {
         let base = Base::try_from("https://example.com/path/page.html").unwrap();
         let source = InputSource::String(String::new());
 
-        let uris = vec![RawUri::from("https://another.com/page")];
+        let uris = vec![raw_uri("https://another.com/page")];
         let requests = create(uris, &source, None, Some(&base), None);
 
         assert_eq!(requests.len(), 1);
@@ -258,7 +274,7 @@ mod tests {
         let base = Base::try_from("https://example.com/path/page.html").unwrap();
         let source = InputSource::String(String::new());
 
-        let uris = vec![RawUri::from("/root-relative")];
+        let uris = vec![raw_uri("/root-relative")];
         let requests = create(uris, &source, None, Some(&base), None);
 
         assert_eq!(requests.len(), 1);
@@ -274,7 +290,7 @@ mod tests {
         let base = Base::try_from("https://example.com/path/page.html").unwrap();
         let source = InputSource::String(String::new());
 
-        let uris = vec![RawUri::from("../parent")];
+        let uris = vec![raw_uri("../parent")];
         let requests = create(uris, &source, None, Some(&base), None);
 
         assert_eq!(requests.len(), 1);
@@ -290,7 +306,7 @@ mod tests {
         let base = Base::try_from("https://example.com/path/page.html").unwrap();
         let source = InputSource::String(String::new());
 
-        let uris = vec![RawUri::from("#fragment")];
+        let uris = vec![raw_uri("#fragment")];
         let requests = create(uris, &source, None, Some(&base), None);
 
         assert_eq!(requests.len(), 1);
@@ -306,7 +322,7 @@ mod tests {
         let root_dir = PathBuf::from("/tmp/lychee");
         let source = InputSource::FsPath(PathBuf::from("/some/page.html"));
 
-        let uris = vec![RawUri::from("relative.html")];
+        let uris = vec![raw_uri("relative.html")];
         let requests = create(uris, &source, Some(&root_dir), None, None);
 
         assert_eq!(requests.len(), 1);
@@ -322,7 +338,7 @@ mod tests {
         let root_dir = PathBuf::from("/tmp/lychee");
         let source = InputSource::FsPath(PathBuf::from("/some/page.html"));
 
-        let uris = vec![RawUri::from("https://another.com/page")];
+        let uris = vec![raw_uri("https://another.com/page")];
         let requests = create(uris, &source, Some(&root_dir), None, None);
 
         assert_eq!(requests.len(), 1);
@@ -338,7 +354,7 @@ mod tests {
         let root_dir = PathBuf::from("/tmp/lychee");
         let source = InputSource::FsPath(PathBuf::from("/some/page.html"));
 
-        let uris = vec![RawUri::from("/root-relative")];
+        let uris = vec![raw_uri("/root-relative")];
         let requests = create(uris, &source, Some(&root_dir), None, None);
 
         assert_eq!(requests.len(), 1);
@@ -354,7 +370,7 @@ mod tests {
         let root_dir = PathBuf::from("/tmp/lychee");
         let source = InputSource::FsPath(PathBuf::from("/some/page.html"));
 
-        let uris = vec![RawUri::from("../parent")];
+        let uris = vec![raw_uri("../parent")];
         let requests = create(uris, &source, Some(&root_dir), None, None);
 
         assert_eq!(requests.len(), 1);
@@ -370,7 +386,7 @@ mod tests {
         let root_dir = PathBuf::from("/tmp/lychee");
         let source = InputSource::FsPath(PathBuf::from("/some/page.html"));
 
-        let uris = vec![RawUri::from("#fragment")];
+        let uris = vec![raw_uri("#fragment")];
         let requests = create(uris, &source, Some(&root_dir), None, None);
 
         assert_eq!(requests.len(), 1);
@@ -387,7 +403,7 @@ mod tests {
         let base = Base::try_from("https://example.com/path/page.html").unwrap();
         let source = InputSource::FsPath(PathBuf::from("/some/page.html"));
 
-        let uris = vec![RawUri::from("relative.html")];
+        let uris = vec![raw_uri("relative.html")];
         let requests = create(uris, &source, Some(&root_dir), Some(&base), None);
 
         assert_eq!(requests.len(), 1);
@@ -404,7 +420,7 @@ mod tests {
         let base = Base::try_from("https://example.com/path/page.html").unwrap();
         let source = InputSource::FsPath(PathBuf::from("/some/page.html"));
 
-        let uris = vec![RawUri::from("https://another.com/page")];
+        let uris = vec![raw_uri("https://another.com/page")];
         let requests = create(uris, &source, Some(&root_dir), Some(&base), None);
 
         assert_eq!(requests.len(), 1);
@@ -421,7 +437,7 @@ mod tests {
         let base = Base::try_from("https://example.com/path/page.html").unwrap();
         let source = InputSource::FsPath(PathBuf::from("/some/page.html"));
 
-        let uris = vec![RawUri::from("/root-relative")];
+        let uris = vec![raw_uri("/root-relative")];
         let requests = create(uris, &source, Some(&root_dir), Some(&base), None);
 
         assert_eq!(requests.len(), 1);
@@ -438,7 +454,7 @@ mod tests {
         let base = Base::try_from("https://example.com/path/page.html").unwrap();
         let source = InputSource::FsPath(PathBuf::from("/some/page.html"));
 
-        let uris = vec![RawUri::from("../parent")];
+        let uris = vec![raw_uri("../parent")];
         let requests = create(uris, &source, Some(&root_dir), Some(&base), None);
 
         assert_eq!(requests.len(), 1);
@@ -455,7 +471,7 @@ mod tests {
         let base = Base::try_from("https://example.com/path/page.html").unwrap();
         let source = InputSource::FsPath(PathBuf::from("/some/page.html"));
 
-        let uris = vec![RawUri::from("#fragment")];
+        let uris = vec![raw_uri("#fragment")];
         let requests = create(uris, &source, Some(&root_dir), Some(&base), None);
 
         assert_eq!(requests.len(), 1);
@@ -470,7 +486,7 @@ mod tests {
     fn test_no_base_url_resolution() {
         let source = InputSource::String(String::new());
 
-        let uris = vec![RawUri::from("https://example.com/page")];
+        let uris = vec![raw_uri("https://example.com/page")];
         let requests = create(uris, &source, None, None, None);
 
         assert_eq!(requests.len(), 1);
@@ -487,7 +503,7 @@ mod tests {
         let input_source = InputSource::FsPath(PathBuf::from("page.html"));
 
         let actual = create_request(
-            &RawUri::from("file.html"),
+            &raw_uri("file.html"),
             &input_source,
             None,
             Some(&base),
@@ -516,7 +532,7 @@ mod tests {
 
         // Use an absolute path that's outside the base directory
         let actual = create_request(
-            &RawUri::from("/usr/local/share/doc/example.html"),
+            &raw_uri("/usr/local/share/doc/example.html"),
             &input_source,
             None,
             Some(&base),
@@ -543,7 +559,7 @@ mod tests {
         let base = Base::Local(PathBuf::from("/tmp/lychee"));
         let source = InputSource::String(String::new());
 
-        let raw_uri = RawUri::from("relative.html");
+        let raw_uri = raw_uri("relative.html");
         let uri = try_parse_into_uri(&raw_uri, &source, None, Some(&base)).unwrap();
 
         assert_eq!(uri.url.as_str(), "file:///tmp/lychee/relative.html");
@@ -554,7 +570,7 @@ mod tests {
         let base = Base::Local(PathBuf::from("/tmp/lychee"));
         let source = InputSource::String(String::new());
 
-        let raw_uri = RawUri::from("absolute.html");
+        let raw_uri = raw_uri("absolute.html");
         let uri = try_parse_into_uri(&raw_uri, &source, None, Some(&base)).unwrap();
 
         assert_eq!(uri.url.as_str(), "file:///tmp/lychee/absolute.html");
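
Reviewer note: a rough sketch of how a caller could render the new span in diagnostics (hypothetical helper, not part of this diff; `source` stands for whatever name the checked input is reported under):

    // Hypothetical formatting helper built on the fields added in this PR.
    fn format_location(source: &str, uri: &RawUri) -> String {
        match uri.span.column {
            // e.g. "README.md:4:19"
            Some(column) => format!("{source}:{}:{}", uri.span.line, column),
            // Column unknown, e.g. for spans coming from the html5ever extractor.
            None => format!("{source}:{}", uri.span.line),
        }
    }
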