feat(parser/html): lex and parse unquoted attribute values

biomejs · Sep 16, 2024 · fb69e6c · fb69e6c
1 parent 2a775c7
commit fb69e6c
Show file tree

Hide file tree

Showing 7 changed files with 181 additions and 3 deletions.
diff --git a/crates/biome_html_parser/src/lexer/mod.rs b/crates/biome_html_parser/src/lexer/mod.rs
@@ -88,6 +88,16 @@ impl<'src> HtmlLexer<'src> {
         }
     }
 
+    /// Consumes one token while lexing in the `HtmlLexContext::AttributeValue` context.
+    ///
+    /// Dispatches on `current`, the byte handed in by the caller:
+    /// - `'` or `"` starts a quoted string literal,
+    /// - `<` and `>` are emitted as single-byte tokens,
+    /// - space, tab, `\r` and `\n` are consumed as whitespace/newline trivia,
+    /// - anything else is lexed as an unquoted attribute value.
+    fn consume_token_attribute_value(&mut self, current: u8) -> HtmlSyntaxKind {
+        match current {
+            quote @ (b'"' | b'\'') => self.consume_string_literal(quote),
+            b'<' => self.consume_byte(T![<]),
+            b'>' => self.consume_byte(T![>]),
+            b' ' | b'\t' | b'\r' | b'\n' => self.consume_newline_or_whitespaces(),
+            _ => self.consume_unquoted_string_literal(),
+        }
+    }
+
     /// Bumps the current byte and creates a lexed token of the passed in kind.
     #[inline]
     fn consume_byte(&mut self, tok: HtmlSyntaxKind) -> HtmlSyntaxKind {
@@ -233,6 +243,35 @@ impl<'src> HtmlLexer<'src> {
         }
     }
 
+    /// Consume an attribute value that is not quoted.
+    ///
+    /// The value runs up to (but does not include) the first space, tab, `\r`, `\n`, `'`,
+    /// `"`, `=`, `<`, `>` or `` ` `` byte, or the end of the input. `?` also ends the value,
+    /// in addition to the characters the specification forbids. Every other byte is part of
+    /// the value, including the bytes of multi-byte UTF-8 characters: the specification only
+    /// excludes ASCII characters from unquoted values, so `alt=café` is a single literal.
+    ///
+    /// Returns `HTML_STRING_LITERAL` when at least one byte was consumed. Otherwise a
+    /// diagnostic covering the byte at the current position is pushed and the result of
+    /// `consume_unexpected_character` is returned.
+    ///
+    /// See: https://html.spec.whatwg.org/#attributes-2 under "Unquoted attribute value syntax"
+    fn consume_unquoted_string_literal(&mut self) -> HtmlSyntaxKind {
+        let mut content_started = false;
+        while let Some(current) = self.current_byte() {
+            match current {
+                b'\n' | b'\r' | b'\t' | b' ' | b'?' | b'\'' | b'"' | b'=' | b'<' | b'>' | b'`' => {
+                    break
+                }
+                // Non-ASCII bytes are consumed one at a time. Every delimiter above is ASCII,
+                // so in valid UTF-8 input the loop can only stop on a character boundary.
+                _ => {
+                    self.advance(1);
+                    content_started = true;
+                }
+            }
+        }
+
+        if content_started {
+            HTML_STRING_LITERAL
+        } else {
+            // Nothing was consumed, so the value starts with one of the delimiters above.
+            self.push_diagnostic(ParseDiagnostic::new(
+                "Unexpected character in unquoted attribute value",
+                self.text_position()..self.text_position().add(TextSize::from(1)),
+            ));
+            self.consume_unexpected_character()
+        }
+    }
+
     fn consume_l_angle(&mut self) -> HtmlSyntaxKind {
         self.assert_byte(b'<');
 
@@ -385,6 +424,7 @@ impl<'src> Lexer<'src> for HtmlLexer<'src> {
                 Some(current) => match context {
                     HtmlLexContext::Regular => self.consume_token(current),
                     HtmlLexContext::OutsideTag => self.consume_token_outside_tag(current),
+                    HtmlLexContext::AttributeValue => self.consume_token_attribute_value(current),
                 },
                 None => EOF,
             }

diff --git a/crates/biome_html_parser/src/syntax/mod.rs b/crates/biome_html_parser/src/syntax/mod.rs
@@ -210,7 +210,7 @@ fn parse_literal(p: &mut HtmlParser) -> ParsedSyntax {
     Present(m.complete(p, HTML_NAME))
 }
 
-fn parse_string_literal(p: &mut HtmlParser) -> ParsedSyntax {
+fn parse_attribute_string_literal(p: &mut HtmlParser) -> ParsedSyntax {
     if !p.at(HTML_STRING_LITERAL) {
         return Absent;
     }
@@ -226,7 +226,7 @@ fn parse_attribute_initializer(p: &mut HtmlParser) -> ParsedSyntax {
         return Absent;
     }
     let m = p.start();
-    p.bump(T![=]);
-    parse_string_literal(p).or_add_diagnostic(p, expected_initializer);
+    p.bump_with_context(T![=], HtmlLexContext::AttributeValue);
+    parse_attribute_string_literal(p).or_add_diagnostic(p, expected_initializer);
     Present(m.complete(p, HTML_ATTRIBUTE_INITIALIZER_CLAUSE))
 }
diff --git a/crates/biome_html_parser/src/token_source.rs b/crates/biome_html_parser/src/token_source.rs
@@ -23,6 +23,10 @@ pub(crate) enum HtmlLexContext {
     ///
     /// The exceptions being `<` which indicates the start of a tag, and `>` which is invalid syntax if not preceded by a `<`.
     OutsideTag,
+    /// When the parser encounters a `=` token (the beginning of the attribute initializer clause), it switches to this context.
+    ///
+    /// This is because attribute values can start and end with a `"` or `'` character, or be unquoted, and the lexer needs to know to start lexing a string literal.
+    AttributeValue,
 }
 
 impl LexContext for HtmlLexContext {

diff --git a/crates/biome_html_parser/tests/html_specs/ok/attributes/attributes-unquoted.html b/crates/biome_html_parser/tests/html_specs/ok/attributes/attributes-unquoted.html
@@ -0,0 +1 @@
+<img src=foo.png />
diff --git a/crates/biome_html_parser/tests/html_specs/ok/attributes/attributes-unquoted.html.snap b/crates/biome_html_parser/tests/html_specs/ok/attributes/attributes-unquoted.html.snap
@@ -0,0 +1,66 @@
+---
+source: crates/biome_html_parser/tests/spec_test.rs
+expression: snapshot
+---
+## Input
+
+```html
+<img src=foo.png />
+
+```
+
+
+## AST
+
+```
+HtmlRoot {
+    bom_token: missing (optional),
+    directive: missing (optional),
+    html: HtmlSelfClosingElement {
+        l_angle_token: L_ANGLE@0..1 "<" [] [],
+        name: HtmlName {
+            value_token: HTML_LITERAL@1..5 "img" [] [Whitespace(" ")],
+        },
+        attributes: HtmlAttributeList [
+            HtmlAttribute {
+                name: HtmlName {
+                    value_token: HTML_LITERAL@5..8 "src" [] [],
+                },
+                initializer: HtmlAttributeInitializerClause {
+                    eq_token: EQ@8..9 "=" [] [],
+                    value: HtmlString {
+                        value_token: HTML_STRING_LITERAL@9..17 "foo.png" [] [Whitespace(" ")],
+                    },
+                },
+            },
+        ],
+        slash_token: SLASH@17..18 "/" [] [],
+        r_angle_token: R_ANGLE@18..19 ">" [] [],
+    },
+    eof_token: EOF@19..20 "" [Newline("\n")] [],
+}
+```
+
+## CST
+
+```
+0: HTML_ROOT@0..20
+  0: (empty)
+  1: (empty)
+  2: HTML_SELF_CLOSING_ELEMENT@0..19
+    0: L_ANGLE@0..1 "<" [] []
+    1: HTML_NAME@1..5
+      0: HTML_LITERAL@1..5 "img" [] [Whitespace(" ")]
+    2: HTML_ATTRIBUTE_LIST@5..17
+      0: HTML_ATTRIBUTE@5..17
+        0: HTML_NAME@5..8
+          0: HTML_LITERAL@5..8 "src" [] []
+        1: HTML_ATTRIBUTE_INITIALIZER_CLAUSE@8..17
+          0: EQ@8..9 "=" [] []
+          1: HTML_STRING@9..17
+            0: HTML_STRING_LITERAL@9..17 "foo.png" [] [Whitespace(" ")]
+    3: SLASH@17..18 "/" [] []
+    4: R_ANGLE@18..19 ">" [] []
+  3: EOF@19..20 "" [Newline("\n")] []
+
+```
diff --git a/crates/biome_html_parser/tests/html_specs/ok/attributes/attributes-unquoted2.html b/crates/biome_html_parser/tests/html_specs/ok/attributes/attributes-unquoted2.html
@@ -0,0 +1 @@
+<img src = foo.png />
diff --git a/crates/biome_html_parser/tests/html_specs/ok/attributes/attributes-unquoted2.html.snap b/crates/biome_html_parser/tests/html_specs/ok/attributes/attributes-unquoted2.html.snap
@@ -0,0 +1,66 @@
+---
+source: crates/biome_html_parser/tests/spec_test.rs
+expression: snapshot
+---
+## Input
+
+```html
+<img src = foo.png />
+
+```
+
+
+## AST
+
+```
+HtmlRoot {
+    bom_token: missing (optional),
+    directive: missing (optional),
+    html: HtmlSelfClosingElement {
+        l_angle_token: L_ANGLE@0..1 "<" [] [],
+        name: HtmlName {
+            value_token: HTML_LITERAL@1..5 "img" [] [Whitespace(" ")],
+        },
+        attributes: HtmlAttributeList [
+            HtmlAttribute {
+                name: HtmlName {
+                    value_token: HTML_LITERAL@5..9 "src" [] [Whitespace(" ")],
+                },
+                initializer: HtmlAttributeInitializerClause {
+                    eq_token: EQ@9..11 "=" [] [Whitespace(" ")],
+                    value: HtmlString {
+                        value_token: HTML_STRING_LITERAL@11..19 "foo.png" [] [Whitespace(" ")],
+                    },
+                },
+            },
+        ],
+        slash_token: SLASH@19..20 "/" [] [],
+        r_angle_token: R_ANGLE@20..21 ">" [] [],
+    },
+    eof_token: EOF@21..22 "" [Newline("\n")] [],
+}
+```
+
+## CST
+
+```
+0: HTML_ROOT@0..22
+  0: (empty)
+  1: (empty)
+  2: HTML_SELF_CLOSING_ELEMENT@0..21
+    0: L_ANGLE@0..1 "<" [] []
+    1: HTML_NAME@1..5
+      0: HTML_LITERAL@1..5 "img" [] [Whitespace(" ")]
+    2: HTML_ATTRIBUTE_LIST@5..19
+      0: HTML_ATTRIBUTE@5..19
+        0: HTML_NAME@5..9
+          0: HTML_LITERAL@5..9 "src" [] [Whitespace(" ")]
+        1: HTML_ATTRIBUTE_INITIALIZER_CLAUSE@9..19
+          0: EQ@9..11 "=" [] [Whitespace(" ")]
+          1: HTML_STRING@11..19
+            0: HTML_STRING_LITERAL@11..19 "foo.png" [] [Whitespace(" ")]
+    3: SLASH@19..20 "/" [] []
+    4: R_ANGLE@20..21 ">" [] []
+  3: EOF@21..22 "" [Newline("\n")] []
+
+```