Skip to content

Commit

Permalink
feat(parser/html): lex and parse unquoted attribute values
Browse files Browse the repository at this point in the history
  • Loading branch information
dyc3 committed Sep 16, 2024
1 parent 2a775c7 commit fb69e6c
Show file tree
Hide file tree
Showing 7 changed files with 181 additions and 3 deletions.
40 changes: 40 additions & 0 deletions crates/biome_html_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,16 @@ impl<'src> HtmlLexer<'src> {
}
}

/// Lexes a single token while the lexer is in the `HtmlLexContext::AttributeValue` context.
///
/// Dispatch is driven by `current`, the byte at the lexer's position:
/// - `\n`, `\r`, `\t` or a space is handed to `consume_newline_or_whitespaces`,
/// - `<` and `>` are each lexed as their own single-byte token,
/// - `'` or `"` starts a quoted value and is handed to `consume_string_literal`,
/// - every other byte is handed to `consume_unquoted_string_literal`.
fn consume_token_attribute_value(&mut self, current: u8) -> HtmlSyntaxKind {
    // Whitespace is handled up front so the match below only deals with value syntax.
    if matches!(current, b'\n' | b'\r' | b'\t' | b' ') {
        return self.consume_newline_or_whitespaces();
    }

    match current {
        b'<' => self.consume_byte(T![<]),
        b'>' => self.consume_byte(T![>]),
        quote @ (b'\'' | b'"') => self.consume_string_literal(quote),
        _ => self.consume_unquoted_string_literal(),
    }
}

/// Bumps the current byte and creates a lexed token of the passed in kind.
#[inline]
fn consume_byte(&mut self, tok: HtmlSyntaxKind) -> HtmlSyntaxKind {
Expand Down Expand Up @@ -233,6 +243,35 @@ impl<'src> HtmlLexer<'src> {
}
}

/// Lexes an attribute value that is written without surrounding quotes.
///
/// Bytes are consumed one at a time until the input ends or the next byte is one of
/// `\n`, `\r`, `\t`, space, `?`, `'`, `"`, `=`, `<`, `>`, `` ` ``, or is not ASCII.
///
/// Returns `HTML_STRING_LITERAL` when at least one byte was consumed. When nothing was
/// consumed, a diagnostic spanning `TextSize::from(1)` from the current text position is
/// pushed and the result of `consume_unexpected_character` is returned instead.
///
/// See: https://html.spec.whatwg.org/#attributes-2 under "Unquoted attribute value syntax"
fn consume_unquoted_string_literal(&mut self) -> HtmlSyntaxKind {
    let mut consumed_any = false;

    while let Some(byte) = self.current_byte() {
        // Either a terminating ASCII byte or any non-ASCII byte stops the scan.
        let ends_value = !byte.is_ascii()
            || matches!(
                byte,
                b'\n' | b'\r' | b'\t' | b' ' | b'?' | b'\'' | b'"' | b'=' | b'<' | b'>' | b'`'
            );
        if ends_value {
            break;
        }

        self.advance(1);
        consumed_any = true;
    }

    if consumed_any {
        return HTML_STRING_LITERAL;
    }

    // The very first byte already stopped the scan: report it and let the
    // generic unexpected-character path produce the token.
    let start = self.text_position();
    self.push_diagnostic(ParseDiagnostic::new(
        "Unexpected character in unquoted attribute value",
        start..start.add(TextSize::from(1)),
    ));
    self.consume_unexpected_character()
}

fn consume_l_angle(&mut self) -> HtmlSyntaxKind {
self.assert_byte(b'<');

Expand Down Expand Up @@ -385,6 +424,7 @@ impl<'src> Lexer<'src> for HtmlLexer<'src> {
Some(current) => match context {
HtmlLexContext::Regular => self.consume_token(current),
HtmlLexContext::OutsideTag => self.consume_token_outside_tag(current),
HtmlLexContext::AttributeValue => self.consume_token_attribute_value(current),
},
None => EOF,
}
Expand Down
6 changes: 3 additions & 3 deletions crates/biome_html_parser/src/syntax/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ fn parse_literal(p: &mut HtmlParser) -> ParsedSyntax {
Present(m.complete(p, HTML_NAME))
}

fn parse_string_literal(p: &mut HtmlParser) -> ParsedSyntax {
fn parse_attribute_string_literal(p: &mut HtmlParser) -> ParsedSyntax {
if !p.at(HTML_STRING_LITERAL) {
return Absent;
}
Expand All @@ -226,7 +226,7 @@ fn parse_attribute_initializer(p: &mut HtmlParser) -> ParsedSyntax {
return Absent;
}
let m = p.start();
p.bump(T![=]);
parse_string_literal(p).or_add_diagnostic(p, expected_initializer);
p.bump_with_context(T![=], HtmlLexContext::AttributeValue);
parse_attribute_string_literal(p).or_add_diagnostic(p, expected_initializer);
Present(m.complete(p, HTML_ATTRIBUTE_INITIALIZER_CLAUSE))
}
4 changes: 4 additions & 0 deletions crates/biome_html_parser/src/token_source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ pub(crate) enum HtmlLexContext {
///
/// The exceptions being `<` which indicates the start of a tag, and `>` which is invalid syntax if not preceded with a `<`.
OutsideTag,
/// When the parser encounters a `=` token (the beginning of the attribute initializer clause), it switches to this context.
///
/// This is because attribute values can start and end with a `"` or `'` character, or be unquoted, and the lexer needs to know to start lexing a string literal.
AttributeValue,
}

impl LexContext for HtmlLexContext {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<img src=foo.png />
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
---
source: crates/biome_html_parser/tests/spec_test.rs
expression: snapshot
---
## Input

```html
<img src=foo.png />
```


## AST

```
HtmlRoot {
bom_token: missing (optional),
directive: missing (optional),
html: HtmlSelfClosingElement {
l_angle_token: L_ANGLE@0..1 "<" [] [],
name: HtmlName {
value_token: HTML_LITERAL@1..5 "img" [] [Whitespace(" ")],
},
attributes: HtmlAttributeList [
HtmlAttribute {
name: HtmlName {
value_token: HTML_LITERAL@5..8 "src" [] [],
},
initializer: HtmlAttributeInitializerClause {
eq_token: EQ@8..9 "=" [] [],
value: HtmlString {
value_token: HTML_STRING_LITERAL@9..17 "foo.png" [] [Whitespace(" ")],
},
},
},
],
slash_token: SLASH@17..18 "/" [] [],
r_angle_token: R_ANGLE@18..19 ">" [] [],
},
eof_token: EOF@19..20 "" [Newline("\n")] [],
}
```

## CST

```
0: HTML_ROOT@0..20
0: (empty)
1: (empty)
2: HTML_SELF_CLOSING_ELEMENT@0..19
0: L_ANGLE@0..1 "<" [] []
1: HTML_NAME@1..5
0: HTML_LITERAL@1..5 "img" [] [Whitespace(" ")]
2: HTML_ATTRIBUTE_LIST@5..17
0: HTML_ATTRIBUTE@5..17
0: HTML_NAME@5..8
0: HTML_LITERAL@5..8 "src" [] []
1: HTML_ATTRIBUTE_INITIALIZER_CLAUSE@8..17
0: EQ@8..9 "=" [] []
1: HTML_STRING@9..17
0: HTML_STRING_LITERAL@9..17 "foo.png" [] [Whitespace(" ")]
3: SLASH@17..18 "/" [] []
4: R_ANGLE@18..19 ">" [] []
3: EOF@19..20 "" [Newline("\n")] []
```
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<img src = foo.png />
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
---
source: crates/biome_html_parser/tests/spec_test.rs
expression: snapshot
---
## Input

```html
<img src = foo.png />
```


## AST

```
HtmlRoot {
bom_token: missing (optional),
directive: missing (optional),
html: HtmlSelfClosingElement {
l_angle_token: L_ANGLE@0..1 "<" [] [],
name: HtmlName {
value_token: HTML_LITERAL@1..5 "img" [] [Whitespace(" ")],
},
attributes: HtmlAttributeList [
HtmlAttribute {
name: HtmlName {
value_token: HTML_LITERAL@5..9 "src" [] [Whitespace(" ")],
},
initializer: HtmlAttributeInitializerClause {
eq_token: EQ@9..11 "=" [] [Whitespace(" ")],
value: HtmlString {
value_token: HTML_STRING_LITERAL@11..19 "foo.png" [] [Whitespace(" ")],
},
},
},
],
slash_token: SLASH@19..20 "/" [] [],
r_angle_token: R_ANGLE@20..21 ">" [] [],
},
eof_token: EOF@21..22 "" [Newline("\n")] [],
}
```

## CST

```
0: HTML_ROOT@0..22
0: (empty)
1: (empty)
2: HTML_SELF_CLOSING_ELEMENT@0..21
0: L_ANGLE@0..1 "<" [] []
1: HTML_NAME@1..5
0: HTML_LITERAL@1..5 "img" [] [Whitespace(" ")]
2: HTML_ATTRIBUTE_LIST@5..19
0: HTML_ATTRIBUTE@5..19
0: HTML_NAME@5..9
0: HTML_LITERAL@5..9 "src" [] [Whitespace(" ")]
1: HTML_ATTRIBUTE_INITIALIZER_CLAUSE@9..19
0: EQ@9..11 "=" [] [Whitespace(" ")]
1: HTML_STRING@11..19
0: HTML_STRING_LITERAL@11..19 "foo.png" [] [Whitespace(" ")]
3: SLASH@19..20 "/" [] []
4: R_ANGLE@20..21 ">" [] []
3: EOF@21..22 "" [Newline("\n")] []
```

0 comments on commit fb69e6c

Please sign in to comment.