feat(parser/html): parse doctype declarations

biomejs · Sep 18, 2024 · 17f704e · 17f704e
1 parent a7b623a
commit 17f704e
Show file tree

Hide file tree

Showing 14 changed files with 304 additions and 16 deletions.
diff --git a/crates/biome_html_parser/src/lexer/mod.rs b/crates/biome_html_parser/src/lexer/mod.rs
@@ -33,8 +33,6 @@ pub(crate) struct HtmlLexer<'src> {
     after_newline: bool,
 
     unicode_bom_length: usize,
-
-    after_doctype: bool,
 }
 
 impl<'src> HtmlLexer<'src> {
@@ -49,24 +47,22 @@ impl<'src> HtmlLexer<'src> {
             after_newline: false,
             current_flags: TokenFlags::empty(),
             unicode_bom_length: 0,
-            after_doctype: false,
         }
     }
+
+    /// Consume a token in the [HtmlLexContext::Regular] context.
     fn consume_token(&mut self, current: u8) -> HtmlSyntaxKind {
         match current {
             b'\n' | b'\r' | b'\t' | b' ' => self.consume_newline_or_whitespaces(),
             b'<' => self.consume_l_angle(),
-            b'>' => {
-                self.after_doctype = false;
-                self.consume_byte(T![>])
-            }
+            b'>' => self.consume_byte(T![>]),
             b'/' => self.consume_byte(T![/]),
             b'=' => self.consume_byte(T![=]),
             b'!' => self.consume_byte(T![!]),
             b'\'' | b'"' => self.consume_string_literal(current),
             // TODO: differentiate between attribute names and identifiers
             _ if is_identifier_byte(current) || is_attribute_name_byte(current) => {
-                self.consume_identifier(current)
+                self.consume_identifier(current, false)
             }
             _ => {
                 if self.position == 0 {
@@ -80,6 +76,7 @@ impl<'src> HtmlLexer<'src> {
         }
     }
 
+    /// Consume a token in the [HtmlLexContext::OutsideTag] context.
     fn consume_token_outside_tag(&mut self, current: u8) -> HtmlSyntaxKind {
         match current {
             b'\n' | b'\r' | b'\t' | b' ' => self.consume_newline_or_whitespaces(),
@@ -88,6 +85,7 @@ impl<'src> HtmlLexer<'src> {
         }
     }
 
+    /// Consume a token in the [HtmlLexContext::AttributeValue] context.
     fn consume_token_attribute_value(&mut self, current: u8) -> HtmlSyntaxKind {
         match current {
             b'\n' | b'\r' | b'\t' | b' ' => self.consume_newline_or_whitespaces(),
@@ -98,6 +96,21 @@ impl<'src> HtmlLexer<'src> {
         }
     }
 
+    /// Consume a token in the [HtmlLexContext::Doctype] context.
+    fn consume_token_doctype(&mut self, current: u8) -> HtmlSyntaxKind {
+        match current {
+            b'\n' | b'\r' | b'\t' | b' ' => self.consume_newline_or_whitespaces(),
+            b'<' => self.consume_byte(T![<]),
+            b'>' => self.consume_byte(T![>]),
+            b'!' => self.consume_byte(T![!]),
+            b'\'' | b'"' => self.consume_string_literal(current),
+            _ if is_identifier_byte(current) || is_attribute_name_byte(current) => {
+                self.consume_identifier(current, true)
+            }
+            _ => self.consume_unexpected_character(),
+        }
+    }
+
     /// Bumps the current byte and creates a lexed token of the passed in kind.
     #[inline]
     fn consume_byte(&mut self, tok: HtmlSyntaxKind) -> HtmlSyntaxKind {
@@ -125,7 +138,7 @@ impl<'src> HtmlLexer<'src> {
         debug_assert!(self.source.is_char_boundary(self.position));
     }
 
-    fn consume_identifier(&mut self, first: u8) -> HtmlSyntaxKind {
+    fn consume_identifier(&mut self, first: u8, doctype_context: bool) -> HtmlSyntaxKind {
         self.assert_current_char_boundary();
 
         const BUFFER_SIZE: usize = 14;
@@ -149,11 +162,8 @@ impl<'src> HtmlLexer<'src> {
         }
 
         match &buffer[..len] {
-            b"doctype" | b"DOCTYPE" => {
-                self.after_doctype = true;
-                DOCTYPE_KW
-            }
-            b"html" | b"HTML" if self.after_doctype => HTML_KW,
+            b"doctype" | b"DOCTYPE" => DOCTYPE_KW,
+            b"html" | b"HTML" if doctype_context => HTML_KW,
             _ => HTML_LITERAL,
         }
     }
@@ -431,6 +441,7 @@ impl<'src> Lexer<'src> for HtmlLexer<'src> {
                     HtmlLexContext::Regular => self.consume_token(current),
                     HtmlLexContext::OutsideTag => self.consume_token_outside_tag(current),
                     HtmlLexContext::AttributeValue => self.consume_token_attribute_value(current),
+                    HtmlLexContext::Doctype => self.consume_token_doctype(current),
                 },
                 None => EOF,
             }

diff --git a/crates/biome_html_parser/src/lexer/tests.rs b/crates/biome_html_parser/src/lexer/tests.rs
@@ -105,6 +105,7 @@ macro_rules! assert_lex {
 #[test]
 fn doctype_key() {
     assert_lex! {
+        HtmlLexContext::Doctype,
         "doctype",
         DOCTYPE_KW: 7,
     }
@@ -113,6 +114,7 @@ fn doctype_key() {
 #[test]
 fn doctype_upper_key() {
     assert_lex! {
+            HtmlLexContext::Doctype,
         "DOCTYPE",
         DOCTYPE_KW: 7,
     }
@@ -164,6 +166,7 @@ fn html_text() {
 #[test]
 fn doctype_with_quirk() {
     assert_lex! {
+        HtmlLexContext::Doctype,
         "<!DOCTYPE HTML>",
         L_ANGLE: 1,
         BANG: 1,
@@ -177,6 +180,7 @@ fn doctype_with_quirk() {
 #[test]
 fn doctype_with_quirk_and_system() {
     assert_lex! {
+        HtmlLexContext::Doctype,
         "<!DOCTYPE HTML \"+//silmaril//dtd html pro v0r11 19970101//\">",
         L_ANGLE: 1,
         BANG: 1,

diff --git a/crates/biome_html_parser/src/syntax/mod.rs b/crates/biome_html_parser/src/syntax/mod.rs
@@ -41,11 +41,23 @@ fn parse_doc_type(p: &mut HtmlParser) -> ParsedSyntax {
     p.bump(T![!]);
 
     if p.at(T![doctype]) {
-        p.eat(T![doctype]);
+        p.eat_with_context(T![doctype], HtmlLexContext::Doctype);
     }
 
     if p.at(T![html]) {
-        p.eat(T![html]);
+        p.eat_with_context(T![html], HtmlLexContext::Doctype);
+    }
+
+    if p.at(HTML_LITERAL) {
+        p.eat_with_context(HTML_LITERAL, HtmlLexContext::Doctype);
+    }
+
+    if p.at(HTML_STRING_LITERAL) {
+        p.eat_with_context(HTML_STRING_LITERAL, HtmlLexContext::Doctype);
+    }
+
+    if p.at(HTML_STRING_LITERAL) {
+        p.eat_with_context(HTML_STRING_LITERAL, HtmlLexContext::Doctype);
     }
 
     p.eat(T![>]);

diff --git a/crates/biome_html_parser/src/token_source.rs b/crates/biome_html_parser/src/token_source.rs
@@ -27,6 +27,10 @@ pub(crate) enum HtmlLexContext {
     ///
     /// This is because attribute values can start and end with a `"` or `'` character, or be unquoted, and the lexer needs to know to start lexing a string literal.
     AttributeValue,
+    /// Enables the `html` keyword token.
+    ///
+    /// When the parser encounters the sequence `<!DOCTYPE`, it switches to this context. It will remain in this context until the next `>` token is encountered.
+    Doctype,
 }
 
 impl LexContext for HtmlLexContext {

diff --git a/crates/biome_html_parser/tests/html_specs/ok/doctype/basic.html b/crates/biome_html_parser/tests/html_specs/ok/doctype/basic.html
@@ -0,0 +1 @@
+<!DOCTYPE html>
diff --git a/crates/biome_html_parser/tests/html_specs/ok/doctype/basic.html.snap b/crates/biome_html_parser/tests/html_specs/ok/doctype/basic.html.snap
@@ -0,0 +1,50 @@
+---
+source: crates/biome_html_parser/tests/spec_test.rs
+expression: snapshot
+---
+## Input
+
+```html
+<!DOCTYPE html>
+
+```
+
+
+## AST
+
+```
+HtmlRoot {
+    bom_token: missing (optional),
+    directive: HtmlDirective {
+        l_angle_token: L_ANGLE@0..1 "<" [] [],
+        excl_token: BANG@1..2 "!" [] [],
+        doctype_token: DOCTYPE_KW@2..10 "DOCTYPE" [] [Whitespace(" ")],
+        html_token: HTML_KW@10..14 "html" [] [],
+        quirk_token: missing (optional),
+        public_id_token: missing (optional),
+        system_id_token: missing (optional),
+        r_angle_token: R_ANGLE@14..15 ">" [] [],
+    },
+    html: missing (optional),
+    eof_token: EOF@15..16 "" [Newline("\n")] [],
+}
+```
+
+## CST
+
+```
+0: HTML_ROOT@0..16
+  0: (empty)
+  1: HTML_DIRECTIVE@0..15
+    0: L_ANGLE@0..1 "<" [] []
+    1: BANG@1..2 "!" [] []
+    2: DOCTYPE_KW@2..10 "DOCTYPE" [] [Whitespace(" ")]
+    3: HTML_KW@10..14 "html" [] []
+    4: (empty)
+    5: (empty)
+    6: (empty)
+    7: R_ANGLE@14..15 ">" [] []
+  2: (empty)
+  3: EOF@15..16 "" [Newline("\n")] []
+
+```
diff --git a/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy1.html b/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy1.html
@@ -0,0 +1 @@
+<!DOCTYPE html SYSTEM "about:legacy-compat">
diff --git a/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy1.html.snap b/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy1.html.snap
@@ -0,0 +1,50 @@
+---
+source: crates/biome_html_parser/tests/spec_test.rs
+expression: snapshot
+---
+## Input
+
+```html
+<!DOCTYPE html SYSTEM "about:legacy-compat">
+
+```
+
+
+## AST
+
+```
+HtmlRoot {
+    bom_token: missing (optional),
+    directive: HtmlDirective {
+        l_angle_token: L_ANGLE@0..1 "<" [] [],
+        excl_token: BANG@1..2 "!" [] [],
+        doctype_token: DOCTYPE_KW@2..10 "DOCTYPE" [] [Whitespace(" ")],
+        html_token: HTML_KW@10..15 "html" [] [Whitespace(" ")],
+        quirk_token: HTML_LITERAL@15..22 "SYSTEM" [] [Whitespace(" ")],
+        public_id_token: HTML_STRING_LITERAL@22..43 "\"about:legacy-compat\"" [] [],
+        system_id_token: missing (optional),
+        r_angle_token: R_ANGLE@43..44 ">" [] [],
+    },
+    html: missing (optional),
+    eof_token: EOF@44..45 "" [Newline("\n")] [],
+}
+```
+
+## CST
+
+```
+0: HTML_ROOT@0..45
+  0: (empty)
+  1: HTML_DIRECTIVE@0..44
+    0: L_ANGLE@0..1 "<" [] []
+    1: BANG@1..2 "!" [] []
+    2: DOCTYPE_KW@2..10 "DOCTYPE" [] [Whitespace(" ")]
+    3: HTML_KW@10..15 "html" [] [Whitespace(" ")]
+    4: HTML_LITERAL@15..22 "SYSTEM" [] [Whitespace(" ")]
+    5: HTML_STRING_LITERAL@22..43 "\"about:legacy-compat\"" [] []
+    6: (empty)
+    7: R_ANGLE@43..44 ">" [] []
+  2: (empty)
+  3: EOF@44..45 "" [Newline("\n")] []
+
+```
diff --git a/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy2.html b/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy2.html
@@ -0,0 +1 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
diff --git a/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy2.html.snap b/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy2.html.snap
@@ -0,0 +1,50 @@
+---
+source: crates/biome_html_parser/tests/spec_test.rs
+expression: snapshot
+---
+## Input
+
+```html
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
+
+```
+
+
+## AST
+
+```
+HtmlRoot {
+    bom_token: missing (optional),
+    directive: HtmlDirective {
+        l_angle_token: L_ANGLE@0..1 "<" [] [],
+        excl_token: BANG@1..2 "!" [] [],
+        doctype_token: DOCTYPE_KW@2..10 "DOCTYPE" [] [Whitespace(" ")],
+        html_token: HTML_KW@10..15 "html" [] [Whitespace(" ")],
+        quirk_token: HTML_LITERAL@15..22 "PUBLIC" [] [Whitespace(" ")],
+        public_id_token: HTML_STRING_LITERAL@22..49 "\"-//W3C//DTD HTML 4.01//EN\"" [] [],
+        system_id_token: missing (optional),
+        r_angle_token: R_ANGLE@49..50 ">" [] [],
+    },
+    html: missing (optional),
+    eof_token: EOF@50..51 "" [Newline("\n")] [],
+}
+```
+
+## CST
+
+```
+0: HTML_ROOT@0..51
+  0: (empty)
+  1: HTML_DIRECTIVE@0..50
+    0: L_ANGLE@0..1 "<" [] []
+    1: BANG@1..2 "!" [] []
+    2: DOCTYPE_KW@2..10 "DOCTYPE" [] [Whitespace(" ")]
+    3: HTML_KW@10..15 "html" [] [Whitespace(" ")]
+    4: HTML_LITERAL@15..22 "PUBLIC" [] [Whitespace(" ")]
+    5: HTML_STRING_LITERAL@22..49 "\"-//W3C//DTD HTML 4.01//EN\"" [] []
+    6: (empty)
+    7: R_ANGLE@49..50 ">" [] []
+  2: (empty)
+  3: EOF@50..51 "" [Newline("\n")] []
+
+```
diff --git a/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy3.html b/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy3.html
@@ -0,0 +1,2 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN"
+  "http://www.w3.org/TR/html4/frameset.dtd">
diff --git a/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy3.html.snap b/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy3.html.snap
@@ -0,0 +1,51 @@
+---
+source: crates/biome_html_parser/tests/spec_test.rs
+expression: snapshot
+---
+## Input
+
+```html
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN"
+  "http://www.w3.org/TR/html4/frameset.dtd">
+
+```
+
+
+## AST
+
+```
+HtmlRoot {
+    bom_token: missing (optional),
+    directive: HtmlDirective {
+        l_angle_token: L_ANGLE@0..1 "<" [] [],
+        excl_token: BANG@1..2 "!" [] [],
+        doctype_token: DOCTYPE_KW@2..10 "DOCTYPE" [] [Whitespace(" ")],
+        html_token: HTML_KW@10..15 "HTML" [] [Whitespace(" ")],
+        quirk_token: HTML_LITERAL@15..22 "PUBLIC" [] [Whitespace(" ")],
+        public_id_token: HTML_STRING_LITERAL@22..58 "\"-//W3C//DTD HTML 4.01 Frameset//EN\"" [] [],
+        system_id_token: HTML_STRING_LITERAL@58..102 "\"http://www.w3.org/TR/html4/frameset.dtd\"" [Newline("\n"), Whitespace("  ")] [],
+        r_angle_token: R_ANGLE@102..103 ">" [] [],
+    },
+    html: missing (optional),
+    eof_token: EOF@103..104 "" [Newline("\n")] [],
+}
+```
+
+## CST
+
+```
+0: HTML_ROOT@0..104
+  0: (empty)
+  1: HTML_DIRECTIVE@0..103
+    0: L_ANGLE@0..1 "<" [] []
+    1: BANG@1..2 "!" [] []
+    2: DOCTYPE_KW@2..10 "DOCTYPE" [] [Whitespace(" ")]
+    3: HTML_KW@10..15 "HTML" [] [Whitespace(" ")]
+    4: HTML_LITERAL@15..22 "PUBLIC" [] [Whitespace(" ")]
+    5: HTML_STRING_LITERAL@22..58 "\"-//W3C//DTD HTML 4.01 Frameset//EN\"" [] []
+    6: HTML_STRING_LITERAL@58..102 "\"http://www.w3.org/TR/html4/frameset.dtd\"" [Newline("\n"), Whitespace("  ")] []
+    7: R_ANGLE@102..103 ">" [] []
+  2: (empty)
+  3: EOF@103..104 "" [Newline("\n")] []
+
+```
diff --git a/crates/biome_html_parser/tests/html_specs/ok/doctype/minimal.html b/crates/biome_html_parser/tests/html_specs/ok/doctype/minimal.html
@@ -0,0 +1 @@
+<!DOCTYPE>