From f9912830ca6dcb69717549b85f0dff9b3d33b8b8 Mon Sep 17 00:00:00 2001 From: Carson McManus Date: Mon, 16 Sep 2024 17:57:31 -0400 Subject: [PATCH] feat(parser/html): lex and parse unquoted attribute values --- crates/biome_html_parser/src/lexer/mod.rs | 27 ++++++++ crates/biome_html_parser/src/syntax/mod.rs | 2 +- crates/biome_html_parser/src/token_source.rs | 4 ++ .../ok/attributes/attributes-unquoted.html | 1 + .../attributes/attributes-unquoted.html.snap | 66 +++++++++++++++++++ 5 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 crates/biome_html_parser/tests/html_specs/ok/attributes/attributes-unquoted.html create mode 100644 crates/biome_html_parser/tests/html_specs/ok/attributes/attributes-unquoted.html.snap diff --git a/crates/biome_html_parser/src/lexer/mod.rs b/crates/biome_html_parser/src/lexer/mod.rs index 4c799f15aa6c..e849bc4e3cf7 100644 --- a/crates/biome_html_parser/src/lexer/mod.rs +++ b/crates/biome_html_parser/src/lexer/mod.rs @@ -88,6 +88,15 @@ impl<'src> HtmlLexer<'src> { } } + fn consume_token_attribute_value(&mut self, current: u8) -> HtmlSyntaxKind { + match current { + b'\n' | b'\r' | b'\t' | b' ' => self.consume_newline_or_whitespaces(), + + b'\'' | b'"' => self.consume_string_literal(current), + _ => self.consume_unquoted_string_literal(), + } + } + /// Bumps the current byte and creates a lexed token of the passed in kind. #[inline] fn consume_byte(&mut self, tok: HtmlSyntaxKind) -> HtmlSyntaxKind { @@ -233,6 +242,23 @@ impl<'src> HtmlLexer<'src> { } } + /// Consume an attribute value that is not quoted. + /// + /// See: https://html.spec.whatwg.org/#attributes-2 under "Unquoted attribute value syntax" + fn consume_unquoted_string_literal(&mut self) -> HtmlSyntaxKind { + while let Some(current) = self.current_byte() { + match current { + b'\n' | b'\r' | b'\t' | b' ' | b'?' 
| b'\'' | b'"' | b'=' | b'<' | b'>' | b'`' => { + break + } + _ if current.is_ascii() => self.advance(1), + _ => break, + } + } + + HTML_STRING_LITERAL + } + fn consume_l_angle(&mut self) -> HtmlSyntaxKind { self.assert_byte(b'<'); @@ -385,6 +411,7 @@ impl<'src> Lexer<'src> for HtmlLexer<'src> { Some(current) => match context { HtmlLexContext::Regular => self.consume_token(current), HtmlLexContext::OutsideTag => self.consume_token_outside_tag(current), + HtmlLexContext::AttributeValue => self.consume_token_attribute_value(current), }, None => EOF, } diff --git a/crates/biome_html_parser/src/syntax/mod.rs b/crates/biome_html_parser/src/syntax/mod.rs index dc776b8c1a62..5cc77c0ab634 100644 --- a/crates/biome_html_parser/src/syntax/mod.rs +++ b/crates/biome_html_parser/src/syntax/mod.rs @@ -226,7 +226,7 @@ fn parse_attribute_initializer(p: &mut HtmlParser) -> ParsedSyntax { return Absent; } let m = p.start(); - p.bump(T![=]); + p.bump_with_context(T![=], HtmlLexContext::AttributeValue); parse_string_literal(p).or_add_diagnostic(p, expected_initializer); Present(m.complete(p, HTML_ATTRIBUTE_INITIALIZER_CLAUSE)) } diff --git a/crates/biome_html_parser/src/token_source.rs b/crates/biome_html_parser/src/token_source.rs index 9c8d6b809ad9..336f0b8ccd6a 100644 --- a/crates/biome_html_parser/src/token_source.rs +++ b/crates/biome_html_parser/src/token_source.rs @@ -23,6 +23,10 @@ pub(crate) enum HtmlLexContext { /// /// The exeptions being `<` which indicates the start of a tag, and `>` which is invalid syntax if not preceeded with a `<`. OutsideTag, + /// When the parser encounters a `=` token (the beginning of the attribute initializer clause), it switches to this context. + /// + /// This is because attribute values can start and end with a `"` or `'` character, or be unquoted, and the lexer needs to know to start lexing a string literal. 
+ AttributeValue, } impl LexContext for HtmlLexContext { diff --git a/crates/biome_html_parser/tests/html_specs/ok/attributes/attributes-unquoted.html b/crates/biome_html_parser/tests/html_specs/ok/attributes/attributes-unquoted.html new file mode 100644 index 000000000000..1b828bd8f673 --- /dev/null +++ b/crates/biome_html_parser/tests/html_specs/ok/attributes/attributes-unquoted.html @@ -0,0 +1 @@ +<img src=foo.png /> diff --git a/crates/biome_html_parser/tests/html_specs/ok/attributes/attributes-unquoted.html.snap b/crates/biome_html_parser/tests/html_specs/ok/attributes/attributes-unquoted.html.snap new file mode 100644 index 000000000000..8c0368081c8b --- /dev/null +++ b/crates/biome_html_parser/tests/html_specs/ok/attributes/attributes-unquoted.html.snap @@ -0,0 +1,66 @@ +--- +source: crates/biome_html_parser/tests/spec_test.rs +expression: snapshot +--- +## Input + +```html +<img src=foo.png /> + +``` + + +## AST + +``` +HtmlRoot { + bom_token: missing (optional), + directive: missing (optional), + html: HtmlSelfClosingElement { + l_angle_token: L_ANGLE@0..1 "<" [] [], + name: HtmlName { + value_token: HTML_LITERAL@1..5 "img" [] [Whitespace(" ")], + }, + attributes: HtmlAttributeList [ + HtmlAttribute { + name: HtmlName { + value_token: HTML_LITERAL@5..8 "src" [] [], + }, + initializer: HtmlAttributeInitializerClause { + eq_token: EQ@8..9 "=" [] [], + value: HtmlString { + value_token: HTML_STRING_LITERAL@9..17 "foo.png" [] [Whitespace(" ")], + }, + }, + }, + ], + slash_token: SLASH@17..18 "/" [] [], + r_angle_token: R_ANGLE@18..19 ">" [] [], + }, + eof_token: EOF@19..20 "" [Newline("\n")] [], +} +``` + +## CST + +``` +0: HTML_ROOT@0..20 + 0: (empty) + 1: (empty) + 2: HTML_SELF_CLOSING_ELEMENT@0..19 + 0: L_ANGLE@0..1 "<" [] [] + 1: HTML_NAME@1..5 + 0: HTML_LITERAL@1..5 "img" [] [Whitespace(" ")] + 2: HTML_ATTRIBUTE_LIST@5..17 + 0: HTML_ATTRIBUTE@5..17 + 0: HTML_NAME@5..8 + 0: HTML_LITERAL@5..8 "src" [] [] + 1: HTML_ATTRIBUTE_INITIALIZER_CLAUSE@8..17 + 0: EQ@8..9 "=" [] [] + 1: HTML_STRING@9..17 
+ 0: HTML_STRING_LITERAL@9..17 "foo.png" [] [Whitespace(" ")] + 3: SLASH@17..18 "/" [] [] + 4: R_ANGLE@18..19 ">" [] [] + 3: EOF@19..20 "" [Newline("\n")] [] + +```