Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 78 additions & 50 deletions lychee-lib/src/extract/html/srcset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
//! use a state machine to keep track of the current state.

use log::info;
use std::result::Result;

enum State {
InsideDescriptor,
Expand All @@ -29,6 +30,9 @@ enum State {

/// Split an input string at the first character for which
/// the predicate returns false.
///
/// In other words, returns the longest prefix span where `predicate` is
/// satisfied, along with the rest of the string.
fn split_at<F>(input: &str, predicate: F) -> (&str, &str)
where
F: Fn(&char) -> bool,
Expand All @@ -47,77 +51,85 @@ where
// for simplicity so we have to please clippy.
pub(crate) fn parse(input: &str) -> Vec<&str> {
let mut candidates: Vec<&str> = Vec::new();
let mut index = 0;

while index < input.len() {
let position = &input[index..];
let (start, remaining) = split_at(position, |c| *c == ',' || c.is_whitespace());

if start.find(',').is_some() {
info!("srcset parse Error");
return vec![];
let mut remaining = input;
while !remaining.is_empty() {
remaining = match parse_one_url(remaining) {
Ok((rem, None)) => rem,
Ok((rem, Some(url))) => {
candidates.push(url);
rem
}
Err(e) => {
info!("{e}");
return vec![];
}
}
index += start.chars().count();
}

if remaining.is_empty() {
return candidates;
}
candidates
}

let (url, remaining) = split_at(remaining, |c| !c.is_whitespace());
index += url.chars().count();
/// Implements one iteration of the "splitting loop" from the reference algorithm.
/// This is intended to be repeatedly called until the remaining string is empty.
///
/// Returns a tuple of remaining string and an optional parsed URL, if successful.
/// Otherwise, in case of srcset syntax errors, returns Err.
///
/// <https://html.spec.whatwg.org/multipage/images.html#parsing-a-srcset-attribute>
fn parse_one_url(remaining: &str) -> Result<(&str, Option<&str>), String> {
let (start, remaining) = split_at(remaining, |c| *c == ',' || c.is_ascii_whitespace());

if start.find(',').is_some() {
return Err("srcset parse error (too many commas)".to_string());
}

let comma_count = url.chars().rev().take_while(|c| *c == ',').count();
if remaining.is_empty() {
return Ok(("", None));
}

if let Some(url) = url.get(..url.len() - comma_count) {
candidates.push(url);
}
let (url, remaining) = split_at(remaining, |c| !c.is_ascii_whitespace());

if comma_count > 1 {
info!("srcset parse error (trailing commas)");
return vec![];
}
let comma_count = url.chars().rev().take_while(|c| *c == ',').count();
if comma_count > 1 {
return Err("srcset parse error (trailing commas)".to_string());
}

index += 1;
let url = url.get(..url.len() - comma_count);

let (space, remaining) = split_at(remaining, |c| c.is_whitespace());
index += space.len();
let (_spaces, remaining) = split_at(remaining, char::is_ascii_whitespace);

index = skip_descriptor(index, remaining);
}
let remaining = skip_descriptor(remaining);

candidates
Ok((remaining, url))
}

/// Helper function to skip over a descriptor.
/// Returns the index of the next character after the descriptor
/// (i.e. pointing at the comma or the end of the string)
fn skip_descriptor(mut index: usize, remaining: &str) -> usize {
/// Helper function to skip over a descriptor. Returns the string remaining
/// after the descriptor (i.e. a string beginning after the next comma or an
/// empty string).
#[allow(clippy::single_match)]
fn skip_descriptor(remaining: &str) -> &str {
let mut state = State::InsideDescriptor;

for c in remaining.chars() {
index += 1;

for (i, c) in remaining.char_indices() {
match state {
State::InsideDescriptor => match c {
' ' => state = State::AfterDescriptor,
c if c.is_ascii_whitespace() => state = State::AfterDescriptor,
'(' => state = State::InsideParens,
',' => return index,
_ => {}
',' => return &remaining[i + c.len_utf8()..], // returns string after this comma
_ => (),
},
State::InsideParens => match c {
')' => state = State::InsideDescriptor,
_ => (),
},
State::AfterDescriptor => match c {
c if c.is_ascii_whitespace() => (),
_ => state = State::InsideDescriptor,
},
State::InsideParens => {
if c == ')' {
state = State::InsideDescriptor;
}
}
State::AfterDescriptor => {
if c != ' ' {
state = State::InsideDescriptor;
}
}
}
}

index
""
}

#[cfg(test)]
Expand Down Expand Up @@ -217,4 +229,20 @@ mod tests {
]
);
}

#[test]
fn test_parse_srcset_without_spaces() {
assert_eq!(
parse(
"/300.png 300w,/600.png 600w,/900.png 900w,https://x.invalid/a.png 1000w,relative.png 10w"
),
vec![
"/300.png",
"/600.png",
"/900.png",
"https://x.invalid/a.png",
"relative.png"
]
);
}
}