diff --git a/lychee-lib/src/extract/html/srcset.rs b/lychee-lib/src/extract/html/srcset.rs index e557a719d0..69b45faa74 100644 --- a/lychee-lib/src/extract/html/srcset.rs +++ b/lychee-lib/src/extract/html/srcset.rs @@ -20,6 +20,7 @@ //! use a state machine to keep track of the current state. use log::info; +use std::result::Result; enum State { InsideDescriptor, @@ -29,6 +30,9 @@ enum State { /// Split an input string at the first character for which /// the predicate returns false. +/// +/// In other words, returns the longest prefix span where `predicate` is +/// satisfied, along with the rest of the string. fn split_at(input: &str, predicate: F) -> (&str, &str) where F: Fn(&char) -> bool, @@ -47,77 +51,85 @@ where // for simplicity so we have to please clippy. pub(crate) fn parse(input: &str) -> Vec<&str> { let mut candidates: Vec<&str> = Vec::new(); - let mut index = 0; - - while index < input.len() { - let position = &input[index..]; - let (start, remaining) = split_at(position, |c| *c == ',' || c.is_whitespace()); - - if start.find(',').is_some() { - info!("srcset parse Error"); - return vec![]; + let mut remaining = input; + while !remaining.is_empty() { + remaining = match parse_one_url(remaining) { + Ok((rem, None)) => rem, + Ok((rem, Some(url))) => { + candidates.push(url); + rem + } + Err(e) => { + info!("{e}"); + return vec![]; + } } - index += start.chars().count(); + } - if remaining.is_empty() { - return candidates; - } + candidates +} - let (url, remaining) = split_at(remaining, |c| !c.is_whitespace()); - index += url.chars().count(); +/// Implements one iteration of the "splitting loop" from the reference algorithm. +/// This is intended to be repeatedly called until the remaining string is empty. +/// +/// Returns a tuple of remaining string and an optional parsed URL, if successful. +/// Otherwise, in case of srcset syntax errors, returns Err. +/// +/// +fn parse_one_url(remaining: &str) -> Result<(&str, Option<&str>), String> { + let (start, remaining) = split_at(remaining, |c| *c == ',' || c.is_ascii_whitespace()); + + if start.find(',').is_some() { + return Err("srcset parse error (too many commas)".to_string()); + } - let comma_count = url.chars().rev().take_while(|c| *c == ',').count(); + if remaining.is_empty() { + return Ok(("", None)); + } - if let Some(url) = url.get(..url.len() - comma_count) { - candidates.push(url); - } + let (url, remaining) = split_at(remaining, |c| !c.is_ascii_whitespace()); - if comma_count > 1 { - info!("srcset parse error (trailing commas)"); - return vec![]; - } + let comma_count = url.chars().rev().take_while(|c| *c == ',').count(); + if comma_count > 1 { + return Err("srcset parse error (trailing commas)".to_string()); + } - index += 1; + let url = url.get(..url.len() - comma_count); - let (space, remaining) = split_at(remaining, |c| c.is_whitespace()); - index += space.len(); + let (_spaces, remaining) = split_at(remaining, char::is_ascii_whitespace); - index = skip_descriptor(index, remaining); - } + let remaining = skip_descriptor(remaining); - candidates + Ok((remaining, url)) } -/// Helper function to skip over a descriptor. -/// Returns the index of the next character after the descriptor -/// (i.e. pointing at the comma or the end of the string) -fn skip_descriptor(mut index: usize, remaining: &str) -> usize { +/// Helper function to skip over a descriptor. Returns the string remaining +/// after the descriptor (i.e. a string beginning after the next comma or an +/// empty string). +#[allow(clippy::single_match)] +fn skip_descriptor(remaining: &str) -> &str { let mut state = State::InsideDescriptor; - for c in remaining.chars() { - index += 1; - + for (i, c) in remaining.char_indices() { match state { State::InsideDescriptor => match c { - ' ' => state = State::AfterDescriptor, + c if c.is_ascii_whitespace() => state = State::AfterDescriptor, '(' => state = State::InsideParens, - ',' => return index, - _ => {} + ',' => return &remaining[i + c.len_utf8()..], // returns string after this comma + _ => (), + }, + State::InsideParens => match c { + ')' => state = State::InsideDescriptor, + _ => (), + }, + State::AfterDescriptor => match c { + c if c.is_ascii_whitespace() => (), + _ => state = State::InsideDescriptor, }, - State::InsideParens => { - if c == ')' { - state = State::InsideDescriptor; - } - } - State::AfterDescriptor => { - if c != ' ' { - state = State::InsideDescriptor; - } - } } } - index + "" } #[cfg(test)] @@ -217,4 +229,20 @@ mod tests { ] ); } + + #[test] + fn test_parse_srcset_without_spaces() { + assert_eq!( + parse( + "/300.png 300w,/600.png 600w,/900.png 900w,https://x.invalid/a.png 1000w,relative.png 10w" + ), + vec![ + "/300.png", + "/600.png", + "/900.png", + "https://x.invalid/a.png", + "relative.png" + ] + ); + } }