From eaa1542089714349918ced3a1ffdb2ffde9df9e5 Mon Sep 17 00:00:00 2001 From: rina Date: Tue, 28 Oct 2025 10:52:58 +1000 Subject: [PATCH 1/5] fix: srcset parsing of URLs after the first previously, the first character of srcset URLs past the first one had their first character dropped. this led to absolute URLs becoming relative and other big problems. this happened when the srcset did not have spaces after its commas: `/300.png 300w,/600.png 600w` really, this could be fixed by simply deleting the `index += 1;` line. this increment causes a mismatch between the index and the remaining string which causes the off-by-one error. this pr makes some extra changes to avoid this duplicated logic by removing the index variable entirely, since the remaining string is all you need. this avoids needing to think about indices and should avoid similar bugs in future. this pr also changes some pattern match of space to accept other ASCII whitespace, following the spec. for those curious, the new test case previously failed with this error: ``` left: ["/300.png", "600.png", "900.png", "ttps://x.invalid/a.png", "elative.png"] right: ["/300.png", "/600.png", "/900.png", "https://x.invalid/a.png", "relative.png"] ``` related to https://www.github.com/lycheeverse/lychee/issues/1190 --- lychee-lib/src/extract/html/srcset.rs | 106 ++++++++++++++++---------- 1 file changed, 66 insertions(+), 40 deletions(-) diff --git a/lychee-lib/src/extract/html/srcset.rs b/lychee-lib/src/extract/html/srcset.rs index e557a719d0..b919c0641b 100644 --- a/lychee-lib/src/extract/html/srcset.rs +++ b/lychee-lib/src/extract/html/srcset.rs @@ -29,6 +29,9 @@ enum State { /// Split an input string at the first character for which /// the predicate returns false. +/// +/// In other words, returns the longest prefix span where `predicate` is +/// satisfied, along with the rest of the string. fn split_at(input: &str, predicate: F) -> (&str, &str) where F: Fn(&char) -> bool, @@ -46,78 +49,85 @@ where // This state-machine is a bit convoluted, but we keep everything in one place // for simplicity so we have to please clippy. pub(crate) fn parse(input: &str) -> Vec<&str> { - let mut candidates: Vec<&str> = Vec::new(); - let mut index = 0; - - while index < input.len() { - let position = &input[index..]; - let (start, remaining) = split_at(position, |c| *c == ',' || c.is_whitespace()); + /// Implements one iteration of the "splitting loop" from the reference algorithm. + /// This is intended to be repeatedly called until the remaining string is empty. + /// + /// Returns a tuple of remaining string and parsed URL (if any). + /// + /// + fn parse_one_url(remaining: &str) -> (&str, Option<&str>) { + let (start, remaining) = split_at(remaining, |c| *c == ',' || c.is_whitespace()); if start.find(',').is_some() { info!("srcset parse Error"); - return vec![]; + return ("", None); } - index += start.chars().count(); if remaining.is_empty() { - return candidates; + return ("", None); } let (url, remaining) = split_at(remaining, |c| !c.is_whitespace()); - index += url.chars().count(); let comma_count = url.chars().rev().take_while(|c| *c == ',').count(); - - if let Some(url) = url.get(..url.len() - comma_count) { - candidates.push(url); - } - if comma_count > 1 { info!("srcset parse error (trailing commas)"); - return vec![]; + return ("", None); } - index += 1; + let url = url.get(..url.len() - comma_count); - let (space, remaining) = split_at(remaining, |c| c.is_whitespace()); - index += space.len(); + let (_spaces, remaining) = split_at(remaining, |c| c.is_whitespace()); - index = skip_descriptor(index, remaining); + let remaining = skip_descriptor(remaining); + + (remaining, url) + } + + let mut candidates: Vec<&str> = Vec::new(); + let mut remaining = input; + while !remaining.is_empty() { + let (new_remaining, url) = parse_one_url(remaining); + if let Some(url) = url { + candidates.push(url); + } + remaining = new_remaining; } candidates } -/// Helper function to skip over a descriptor. -/// Returns the index of the next character after the descriptor -/// (i.e. pointing at the comma or the end of the string) -fn skip_descriptor(mut index: usize, remaining: &str) -> usize { +/// Helper function to skip over a descriptor. Returns the string remaining +/// after the descriptor (i.e. a string beginning after the next comma or an +/// empty string). +#[allow(clippy::single_match)] +fn skip_descriptor(remaining: &str) -> &str { let mut state = State::InsideDescriptor; - for c in remaining.chars() { - index += 1; + // ASCII whitespace is U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE. + // https://infra.spec.whatwg.org/#ascii-whitespace + let mut it = remaining.chars(); + while let Some(c) = it.next() { match state { State::InsideDescriptor => match c { - ' ' => state = State::AfterDescriptor, + ' ' | '\t' | '\n' | '\u{000C}' | '\r' => state = State::AfterDescriptor, '(' => state = State::InsideParens, - ',' => return index, - _ => {} + ',' => return it.as_str(), // returns string after this comma + _ => (), + }, + State::InsideParens => match c { + ')' => state = State::InsideDescriptor, + _ => (), + }, + State::AfterDescriptor => match c { + ' ' | '\t' | '\n' | '\u{000C}' | '\r' => (), + _ => state = State::InsideDescriptor, }, - State::InsideParens => { - if c == ')' { - state = State::InsideDescriptor; - } - } - State::AfterDescriptor => { - if c != ' ' { - state = State::InsideDescriptor; - } - } } } - index + "" } #[cfg(test)] @@ -217,4 +227,20 @@ mod tests { ] ); } + + #[test] + fn test_parse_srcset_without_spaces() { + assert_eq!( + parse( + "/300.png 300w,/600.png 600w,/900.png 900w,https://x.invalid/a.png 1000w,relative.png 10w" + ), + vec![ + "/300.png", + "/600.png", + "/900.png", + "https://x.invalid/a.png", + "relative.png" + ] + ); + } } From 316805992ec17bbfde41959044a586a8a4685fe7 Mon Sep 17 00:00:00 2001 From: rina Date: Tue, 28 Oct 2025 11:52:21 +1000 Subject: [PATCH 2/5] distinguish errors from end of string. should the info messages be a warning instead? --- lychee-lib/src/extract/html/srcset.rs | 30 ++++++++++++++++----------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/lychee-lib/src/extract/html/srcset.rs b/lychee-lib/src/extract/html/srcset.rs index b919c0641b..9d883e6858 100644 --- a/lychee-lib/src/extract/html/srcset.rs +++ b/lychee-lib/src/extract/html/srcset.rs @@ -20,6 +20,7 @@ //! use a state machine to keep track of the current state. use log::info; +use std::result::Result; enum State { InsideDescriptor, @@ -52,27 +53,26 @@ pub(crate) fn parse(input: &str) -> Vec<&str> { /// Implements one iteration of the "splitting loop" from the reference algorithm. /// This is intended to be repeatedly called until the remaining string is empty. /// - /// Returns a tuple of remaining string and parsed URL (if any). + /// Returns a tuple of remaining string and an optional parsed URL, if successful. + /// Otherwise, in case of srcset syntax errors, returns Err. /// /// - fn parse_one_url(remaining: &str) -> (&str, Option<&str>) { + fn parse_one_url(remaining: &str) -> Result<(&str, Option<&str>), String> { let (start, remaining) = split_at(remaining, |c| *c == ',' || c.is_whitespace()); if start.find(',').is_some() { - info!("srcset parse Error"); - return ("", None); + return Err("srcset parse error (too many commas)".to_string()); } if remaining.is_empty() { - return ("", None); + return Ok(("", None)); } let (url, remaining) = split_at(remaining, |c| !c.is_whitespace()); let comma_count = url.chars().rev().take_while(|c| *c == ',').count(); if comma_count > 1 { - info!("srcset parse error (trailing commas)"); - return ("", None); + return Err("srcset parse error (trailing commas)".to_string()); } let url = url.get(..url.len() - comma_count); @@ -81,17 +81,23 @@ pub(crate) fn parse(input: &str) -> Vec<&str> { let remaining = skip_descriptor(remaining); - (remaining, url) + Ok((remaining, url)) } let mut candidates: Vec<&str> = Vec::new(); let mut remaining = input; while !remaining.is_empty() { - let (new_remaining, url) = parse_one_url(remaining); - if let Some(url) = url { - candidates.push(url); + remaining = match parse_one_url(remaining) { + Ok((rem, None)) => rem, + Ok((rem, Some(url))) => { + candidates.push(url); + rem + } + Err(e) => { + info!("{e}"); + return vec![]; + } } - remaining = new_remaining; } candidates From 64b4243c6f6bc7a838b4f3478134bf62db6ac785 Mon Sep 17 00:00:00 2001 From: rina Date: Thu, 30 Oct 2025 21:53:39 +1000 Subject: [PATCH 3/5] review: is_ascii_whitespace and use for loop too --- lychee-lib/src/extract/html/srcset.rs | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/lychee-lib/src/extract/html/srcset.rs b/lychee-lib/src/extract/html/srcset.rs index 9d883e6858..cef37d0db4 100644 --- a/lychee-lib/src/extract/html/srcset.rs +++ b/lychee-lib/src/extract/html/srcset.rs @@ -58,7 +58,7 @@ pub(crate) fn parse(input: &str) -> Vec<&str> { /// /// fn parse_one_url(remaining: &str) -> Result<(&str, Option<&str>), String> { - let (start, remaining) = split_at(remaining, |c| *c == ',' || c.is_whitespace()); + let (start, remaining) = split_at(remaining, |c| *c == ',' || c.is_ascii_whitespace()); if start.find(',').is_some() { return Err("srcset parse error (too many commas)".to_string()); @@ -68,7 +68,7 @@ pub(crate) fn parse(input: &str) -> Vec<&str> { return Ok(("", None)); } - let (url, remaining) = split_at(remaining, |c| !c.is_whitespace()); + let (url, remaining) = split_at(remaining, |c| !c.is_ascii_whitespace()); let comma_count = url.chars().rev().take_while(|c| *c == ',').count(); if comma_count > 1 { @@ -77,7 +77,7 @@ pub(crate) fn parse(input: &str) -> Vec<&str> { let url = url.get(..url.len() - comma_count); - let (_spaces, remaining) = split_at(remaining, |c| c.is_whitespace()); + let (_spaces, remaining) = split_at(remaining, |c| c.is_ascii_whitespace()); let remaining = skip_descriptor(remaining); @@ -110,16 +110,12 @@ pub(crate) fn parse(input: &str) -> Vec<&str> { fn skip_descriptor(remaining: &str) -> &str { let mut state = State::InsideDescriptor; - // ASCII whitespace is U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE. - // https://infra.spec.whatwg.org/#ascii-whitespace - - let mut it = remaining.chars(); - while let Some(c) = it.next() { + for (i, c) in remaining.char_indices() { match state { State::InsideDescriptor => match c { - ' ' | '\t' | '\n' | '\u{000C}' | '\r' => state = State::AfterDescriptor, + c if c.is_ascii_whitespace() => state = State::AfterDescriptor, '(' => state = State::InsideParens, - ',' => return it.as_str(), // returns string after this comma + ',' => return &remaining[i + c.len_utf8()..], // returns string after this comma _ => (), }, State::InsideParens => match c { @@ -127,7 +123,7 @@ fn skip_descriptor(remaining: &str) -> &str { _ => (), }, State::AfterDescriptor => match c { - ' ' | '\t' | '\n' | '\u{000C}' | '\r' => (), + c if c.is_ascii_whitespace() => (), _ => state = State::InsideDescriptor, }, } From 0c40e8664f3488d4889bf224bb1ad8fc76b90217 Mon Sep 17 00:00:00 2001 From: rina Date: Thu, 30 Oct 2025 21:58:49 +1000 Subject: [PATCH 4/5] clippy --- lychee-lib/src/extract/html/srcset.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lychee-lib/src/extract/html/srcset.rs b/lychee-lib/src/extract/html/srcset.rs index cef37d0db4..9e8777739d 100644 --- a/lychee-lib/src/extract/html/srcset.rs +++ b/lychee-lib/src/extract/html/srcset.rs @@ -77,7 +77,7 @@ pub(crate) fn parse(input: &str) -> Vec<&str> { let url = url.get(..url.len() - comma_count); - let (_spaces, remaining) = split_at(remaining, |c| c.is_ascii_whitespace()); + let (_spaces, remaining) = split_at(remaining, char::is_ascii_whitespace); let remaining = skip_descriptor(remaining); From 97abb8b59fd89f8f8826916e9928d457fd1743a6 Mon Sep 17 00:00:00 2001 From: katrinafyi <39479354+katrinafyi@users.noreply.github.com> Date: Fri, 31 Oct 2025 01:33:00 +1000 Subject: [PATCH 5/5] review: move inner function outside --- lychee-lib/src/extract/html/srcset.rs | 68 +++++++++++++-------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/lychee-lib/src/extract/html/srcset.rs b/lychee-lib/src/extract/html/srcset.rs index 9e8777739d..69b45faa74 100644 --- a/lychee-lib/src/extract/html/srcset.rs +++ b/lychee-lib/src/extract/html/srcset.rs @@ -50,40 +50,6 @@ where // This state-machine is a bit convoluted, but we keep everything in one place // for simplicity so we have to please clippy. pub(crate) fn parse(input: &str) -> Vec<&str> { - /// Implements one iteration of the "splitting loop" from the reference algorithm. - /// This is intended to be repeatedly called until the remaining string is empty. - /// - /// Returns a tuple of remaining string and an optional parsed URL, if successful. - /// Otherwise, in case of srcset syntax errors, returns Err. - /// - /// - fn parse_one_url(remaining: &str) -> Result<(&str, Option<&str>), String> { - let (start, remaining) = split_at(remaining, |c| *c == ',' || c.is_ascii_whitespace()); - - if start.find(',').is_some() { - return Err("srcset parse error (too many commas)".to_string()); - } - - if remaining.is_empty() { - return Ok(("", None)); - } - - let (url, remaining) = split_at(remaining, |c| !c.is_ascii_whitespace()); - - let comma_count = url.chars().rev().take_while(|c| *c == ',').count(); - if comma_count > 1 { - return Err("srcset parse error (trailing commas)".to_string()); - } - - let url = url.get(..url.len() - comma_count); - - let (_spaces, remaining) = split_at(remaining, char::is_ascii_whitespace); - - let remaining = skip_descriptor(remaining); - - Ok((remaining, url)) - } - let mut candidates: Vec<&str> = Vec::new(); let mut remaining = input; while !remaining.is_empty() { @@ -103,6 +69,40 @@ pub(crate) fn parse(input: &str) -> Vec<&str> { candidates } +/// Implements one iteration of the "splitting loop" from the reference algorithm. +/// This is intended to be repeatedly called until the remaining string is empty. +/// +/// Returns a tuple of remaining string and an optional parsed URL, if successful. +/// Otherwise, in case of srcset syntax errors, returns Err. +/// +/// +fn parse_one_url(remaining: &str) -> Result<(&str, Option<&str>), String> { + let (start, remaining) = split_at(remaining, |c| *c == ',' || c.is_ascii_whitespace()); + + if start.find(',').is_some() { + return Err("srcset parse error (too many commas)".to_string()); + } + + if remaining.is_empty() { + return Ok(("", None)); + } + + let (url, remaining) = split_at(remaining, |c| !c.is_ascii_whitespace()); + + let comma_count = url.chars().rev().take_while(|c| *c == ',').count(); + if comma_count > 1 { + return Err("srcset parse error (trailing commas)".to_string()); + } + + let url = url.get(..url.len() - comma_count); + + let (_spaces, remaining) = split_at(remaining, char::is_ascii_whitespace); + + let remaining = skip_descriptor(remaining); + + Ok((remaining, url)) +} + /// Helper function to skip over a descriptor. Returns the string remaining /// after the descriptor (i.e. a string beginning after the next comma or an /// empty string).