diff --git a/lychee-lib/src/extract/html/srcset.rs b/lychee-lib/src/extract/html/srcset.rs
index e557a719d0..69b45faa74 100644
--- a/lychee-lib/src/extract/html/srcset.rs
+++ b/lychee-lib/src/extract/html/srcset.rs
@@ -20,6 +20,7 @@
//! use a state machine to keep track of the current state.
use log::info;
+use std::result::Result;
enum State {
InsideDescriptor,
@@ -29,6 +30,9 @@ enum State {
/// Split an input string at the first character for which
/// the predicate returns false.
+///
+/// In other words, returns the longest prefix span where `predicate` is
+/// satisfied, along with the rest of the string.
fn split_at(input: &str, predicate: F) -> (&str, &str)
where
F: Fn(&char) -> bool,
@@ -47,77 +51,85 @@ where
// for simplicity so we have to please clippy.
pub(crate) fn parse(input: &str) -> Vec<&str> {
let mut candidates: Vec<&str> = Vec::new();
- let mut index = 0;
-
- while index < input.len() {
- let position = &input[index..];
- let (start, remaining) = split_at(position, |c| *c == ',' || c.is_whitespace());
-
- if start.find(',').is_some() {
- info!("srcset parse Error");
- return vec![];
+ let mut remaining = input;
+ while !remaining.is_empty() {
+ remaining = match parse_one_url(remaining) {
+ Ok((rem, None)) => rem,
+ Ok((rem, Some(url))) => {
+ candidates.push(url);
+ rem
+ }
+ Err(e) => {
+ info!("{e}");
+ return vec![];
+ }
}
- index += start.chars().count();
+ }
- if remaining.is_empty() {
- return candidates;
- }
+ candidates
+}
- let (url, remaining) = split_at(remaining, |c| !c.is_whitespace());
- index += url.chars().count();
+/// Implements one iteration of the "splitting loop" from the reference algorithm.
+/// This is intended to be repeatedly called until the remaining string is empty.
+///
+/// Returns a tuple of remaining string and an optional parsed URL, if successful.
+/// Otherwise, in case of srcset syntax errors, returns Err.
+///
+///
+fn parse_one_url(remaining: &str) -> Result<(&str, Option<&str>), String> {
+ let (start, remaining) = split_at(remaining, |c| *c == ',' || c.is_ascii_whitespace());
+
+ if start.find(',').is_some() {
+ return Err("srcset parse error (too many commas)".to_string());
+ }
- let comma_count = url.chars().rev().take_while(|c| *c == ',').count();
+ if remaining.is_empty() {
+ return Ok(("", None));
+ }
- if let Some(url) = url.get(..url.len() - comma_count) {
- candidates.push(url);
- }
+ let (url, remaining) = split_at(remaining, |c| !c.is_ascii_whitespace());
- if comma_count > 1 {
- info!("srcset parse error (trailing commas)");
- return vec![];
- }
+ let comma_count = url.chars().rev().take_while(|c| *c == ',').count();
+ if comma_count > 1 {
+ return Err("srcset parse error (trailing commas)".to_string());
+ }
- index += 1;
+ let url = url.get(..url.len() - comma_count);
- let (space, remaining) = split_at(remaining, |c| c.is_whitespace());
- index += space.len();
+ let (_spaces, remaining) = split_at(remaining, char::is_ascii_whitespace);
- index = skip_descriptor(index, remaining);
- }
+ let remaining = skip_descriptor(remaining);
- candidates
+ Ok((remaining, url))
}
-/// Helper function to skip over a descriptor.
-/// Returns the index of the next character after the descriptor
-/// (i.e. pointing at the comma or the end of the string)
-fn skip_descriptor(mut index: usize, remaining: &str) -> usize {
+/// Helper function to skip over a descriptor. Returns the string remaining
+/// after the descriptor (i.e. a string beginning after the next comma or an
+/// empty string).
+#[allow(clippy::single_match)]
+fn skip_descriptor(remaining: &str) -> &str {
let mut state = State::InsideDescriptor;
- for c in remaining.chars() {
- index += 1;
-
+ for (i, c) in remaining.char_indices() {
match state {
State::InsideDescriptor => match c {
- ' ' => state = State::AfterDescriptor,
+ c if c.is_ascii_whitespace() => state = State::AfterDescriptor,
'(' => state = State::InsideParens,
- ',' => return index,
- _ => {}
+ ',' => return &remaining[i + c.len_utf8()..], // returns string after this comma
+ _ => (),
+ },
+ State::InsideParens => match c {
+ ')' => state = State::InsideDescriptor,
+ _ => (),
+ },
+ State::AfterDescriptor => match c {
+ c if c.is_ascii_whitespace() => (),
+ _ => state = State::InsideDescriptor,
},
- State::InsideParens => {
- if c == ')' {
- state = State::InsideDescriptor;
- }
- }
- State::AfterDescriptor => {
- if c != ' ' {
- state = State::InsideDescriptor;
- }
- }
}
}
- index
+ ""
}
#[cfg(test)]
@@ -217,4 +229,20 @@ mod tests {
]
);
}
+
+ #[test]
+ fn test_parse_srcset_without_spaces() {
+ assert_eq!(
+ parse(
+ "/300.png 300w,/600.png 600w,/900.png 900w,https://x.invalid/a.png 1000w,relative.png 10w"
+ ),
+ vec![
+ "/300.png",
+ "/600.png",
+ "/900.png",
+ "https://x.invalid/a.png",
+ "relative.png"
+ ]
+ );
+ }
}