From 17f704e88c0280d920d90b9e293f262f257dab33 Mon Sep 17 00:00:00 2001 From: Carson McManus Date: Tue, 17 Sep 2024 10:11:49 -0400 Subject: [PATCH] feat(parser/html): parse doctype declarations --- crates/biome_html_parser/src/lexer/mod.rs | 39 +++++++++----- crates/biome_html_parser/src/lexer/tests.rs | 4 ++ crates/biome_html_parser/src/syntax/mod.rs | 16 +++++- crates/biome_html_parser/src/token_source.rs | 4 ++ .../tests/html_specs/ok/doctype/basic.html | 1 + .../html_specs/ok/doctype/basic.html.snap | 50 ++++++++++++++++++ .../html_specs/ok/doctype/long-legacy1.html | 1 + .../ok/doctype/long-legacy1.html.snap | 50 ++++++++++++++++++ .../html_specs/ok/doctype/long-legacy2.html | 1 + .../ok/doctype/long-legacy2.html.snap | 50 ++++++++++++++++++ .../html_specs/ok/doctype/long-legacy3.html | 2 + .../ok/doctype/long-legacy3.html.snap | 51 +++++++++++++++++++ .../tests/html_specs/ok/doctype/minimal.html | 1 + .../html_specs/ok/doctype/minimal.html.snap | 50 ++++++++++++++++++ 14 files changed, 304 insertions(+), 16 deletions(-) create mode 100644 crates/biome_html_parser/tests/html_specs/ok/doctype/basic.html create mode 100644 crates/biome_html_parser/tests/html_specs/ok/doctype/basic.html.snap create mode 100644 crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy1.html create mode 100644 crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy1.html.snap create mode 100644 crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy2.html create mode 100644 crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy2.html.snap create mode 100644 crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy3.html create mode 100644 crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy3.html.snap create mode 100644 crates/biome_html_parser/tests/html_specs/ok/doctype/minimal.html create mode 100644 crates/biome_html_parser/tests/html_specs/ok/doctype/minimal.html.snap diff --git a/crates/biome_html_parser/src/lexer/mod.rs 
b/crates/biome_html_parser/src/lexer/mod.rs index b8294abe2e62..2df6e0b21b2b 100644 --- a/crates/biome_html_parser/src/lexer/mod.rs +++ b/crates/biome_html_parser/src/lexer/mod.rs @@ -33,8 +33,6 @@ pub(crate) struct HtmlLexer<'src> { after_newline: bool, unicode_bom_length: usize, - - after_doctype: bool, } impl<'src> HtmlLexer<'src> { @@ -49,24 +47,22 @@ impl<'src> HtmlLexer<'src> { after_newline: false, current_flags: TokenFlags::empty(), unicode_bom_length: 0, - after_doctype: false, } } + + /// Consume a token in the [HtmlLexContext::Regular] context. fn consume_token(&mut self, current: u8) -> HtmlSyntaxKind { match current { b'\n' | b'\r' | b'\t' | b' ' => self.consume_newline_or_whitespaces(), b'<' => self.consume_l_angle(), - b'>' => { - self.after_doctype = false; - self.consume_byte(T![>]) - } + b'>' => self.consume_byte(T![>]), b'/' => self.consume_byte(T![/]), b'=' => self.consume_byte(T![=]), b'!' => self.consume_byte(T![!]), b'\'' | b'"' => self.consume_string_literal(current), // TODO: differentiate between attribute names and identifiers _ if is_identifier_byte(current) || is_attribute_name_byte(current) => { - self.consume_identifier(current) + self.consume_identifier(current, false) } _ => { if self.position == 0 { @@ -80,6 +76,7 @@ impl<'src> HtmlLexer<'src> { } } + /// Consume a token in the [HtmlLexContext::OutsideTag] context. fn consume_token_outside_tag(&mut self, current: u8) -> HtmlSyntaxKind { match current { b'\n' | b'\r' | b'\t' | b' ' => self.consume_newline_or_whitespaces(), @@ -88,6 +85,7 @@ impl<'src> HtmlLexer<'src> { } } + /// Consume a token in the [HtmlLexContext::AttributeValue] context. fn consume_token_attribute_value(&mut self, current: u8) -> HtmlSyntaxKind { match current { b'\n' | b'\r' | b'\t' | b' ' => self.consume_newline_or_whitespaces(), @@ -98,6 +96,21 @@ impl<'src> HtmlLexer<'src> { } } + /// Consume a token in the [HtmlLexContext::Doctype] context. 
+ fn consume_token_doctype(&mut self, current: u8) -> HtmlSyntaxKind { + match current { + b'\n' | b'\r' | b'\t' | b' ' => self.consume_newline_or_whitespaces(), + b'<' => self.consume_byte(T![<]), + b'>' => self.consume_byte(T![>]), + b'!' => self.consume_byte(T![!]), + b'\'' | b'"' => self.consume_string_literal(current), + _ if is_identifier_byte(current) || is_attribute_name_byte(current) => { + self.consume_identifier(current, true) + } + _ => self.consume_unexpected_character(), + } + } + /// Bumps the current byte and creates a lexed token of the passed in kind. #[inline] fn consume_byte(&mut self, tok: HtmlSyntaxKind) -> HtmlSyntaxKind { @@ -125,7 +138,7 @@ impl<'src> HtmlLexer<'src> { debug_assert!(self.source.is_char_boundary(self.position)); } - fn consume_identifier(&mut self, first: u8) -> HtmlSyntaxKind { + fn consume_identifier(&mut self, first: u8, doctype_context: bool) -> HtmlSyntaxKind { self.assert_current_char_boundary(); const BUFFER_SIZE: usize = 14; @@ -149,11 +162,8 @@ impl<'src> HtmlLexer<'src> { } match &buffer[..len] { - b"doctype" | b"DOCTYPE" => { - self.after_doctype = true; - DOCTYPE_KW - } - b"html" | b"HTML" if self.after_doctype => HTML_KW, + b"doctype" | b"DOCTYPE" => DOCTYPE_KW, + b"html" | b"HTML" if doctype_context => HTML_KW, _ => HTML_LITERAL, } } @@ -431,6 +441,7 @@ impl<'src> Lexer<'src> for HtmlLexer<'src> { HtmlLexContext::Regular => self.consume_token(current), HtmlLexContext::OutsideTag => self.consume_token_outside_tag(current), HtmlLexContext::AttributeValue => self.consume_token_attribute_value(current), + HtmlLexContext::Doctype => self.consume_token_doctype(current), }, None => EOF, } diff --git a/crates/biome_html_parser/src/lexer/tests.rs b/crates/biome_html_parser/src/lexer/tests.rs index e5ecdfd921e6..a541a6e6ec2e 100644 --- a/crates/biome_html_parser/src/lexer/tests.rs +++ b/crates/biome_html_parser/src/lexer/tests.rs @@ -105,6 +105,7 @@ macro_rules! assert_lex { #[test] fn doctype_key() { assert_lex! 
{ + HtmlLexContext::Doctype, "doctype", DOCTYPE_KW: 7, } @@ -113,6 +114,7 @@ fn doctype_key() { #[test] fn doctype_upper_key() { assert_lex! { + HtmlLexContext::Doctype, "DOCTYPE", DOCTYPE_KW: 7, } @@ -164,6 +166,7 @@ fn html_text() { #[test] fn doctype_with_quirk() { assert_lex! { + HtmlLexContext::Doctype, "", L_ANGLE: 1, BANG: 1, @@ -177,6 +180,7 @@ fn doctype_with_quirk() { #[test] fn doctype_with_quirk_and_system() { assert_lex! { + HtmlLexContext::Doctype, "", L_ANGLE: 1, BANG: 1, diff --git a/crates/biome_html_parser/src/syntax/mod.rs b/crates/biome_html_parser/src/syntax/mod.rs index e7cca9c9d8f8..d5b0987ad9cc 100644 --- a/crates/biome_html_parser/src/syntax/mod.rs +++ b/crates/biome_html_parser/src/syntax/mod.rs @@ -41,11 +41,23 @@ fn parse_doc_type(p: &mut HtmlParser) -> ParsedSyntax { p.bump(T![!]); if p.at(T![doctype]) { - p.eat(T![doctype]); + p.eat_with_context(T![doctype], HtmlLexContext::Doctype); } if p.at(T![html]) { - p.eat(T![html]); + p.eat_with_context(T![html], HtmlLexContext::Doctype); + } + + if p.at(HTML_LITERAL) { + p.eat_with_context(HTML_LITERAL, HtmlLexContext::Doctype); + } + + if p.at(HTML_STRING_LITERAL) { + p.eat_with_context(HTML_STRING_LITERAL, HtmlLexContext::Doctype); + } + + if p.at(HTML_STRING_LITERAL) { + p.eat_with_context(HTML_STRING_LITERAL, HtmlLexContext::Doctype); } p.eat(T![>]); diff --git a/crates/biome_html_parser/src/token_source.rs b/crates/biome_html_parser/src/token_source.rs index 336f0b8ccd6a..f5edc6d92539 100644 --- a/crates/biome_html_parser/src/token_source.rs +++ b/crates/biome_html_parser/src/token_source.rs @@ -27,6 +27,10 @@ pub(crate) enum HtmlLexContext { /// /// This is because attribute values can start and end with a `"` or `'` character, or be unquoted, and the lexer needs to know to start lexing a string literal. AttributeValue, + /// Enables the `html` keyword token. + /// + /// When the parser has encountered the sequence `<!DOCTYPE`, it switches to this context. It will remain in this context until the next `>` token is encountered.
+ Doctype, } impl LexContext for HtmlLexContext { diff --git a/crates/biome_html_parser/tests/html_specs/ok/doctype/basic.html b/crates/biome_html_parser/tests/html_specs/ok/doctype/basic.html new file mode 100644 index 000000000000..0e76edd65b7b --- /dev/null +++ b/crates/biome_html_parser/tests/html_specs/ok/doctype/basic.html @@ -0,0 +1 @@ + diff --git a/crates/biome_html_parser/tests/html_specs/ok/doctype/basic.html.snap b/crates/biome_html_parser/tests/html_specs/ok/doctype/basic.html.snap new file mode 100644 index 000000000000..5679e1bd7039 --- /dev/null +++ b/crates/biome_html_parser/tests/html_specs/ok/doctype/basic.html.snap @@ -0,0 +1,50 @@ +--- +source: crates/biome_html_parser/tests/spec_test.rs +expression: snapshot +--- +## Input + +```html + + +``` + + +## AST + +``` +HtmlRoot { + bom_token: missing (optional), + directive: HtmlDirective { + l_angle_token: L_ANGLE@0..1 "<" [] [], + excl_token: BANG@1..2 "!" [] [], + doctype_token: DOCTYPE_KW@2..10 "DOCTYPE" [] [Whitespace(" ")], + html_token: HTML_KW@10..14 "html" [] [], + quirk_token: missing (optional), + public_id_token: missing (optional), + system_id_token: missing (optional), + r_angle_token: R_ANGLE@14..15 ">" [] [], + }, + html: missing (optional), + eof_token: EOF@15..16 "" [Newline("\n")] [], +} +``` + +## CST + +``` +0: HTML_ROOT@0..16 + 0: (empty) + 1: HTML_DIRECTIVE@0..15 + 0: L_ANGLE@0..1 "<" [] [] + 1: BANG@1..2 "!" 
[] [] + 2: DOCTYPE_KW@2..10 "DOCTYPE" [] [Whitespace(" ")] + 3: HTML_KW@10..14 "html" [] [] + 4: (empty) + 5: (empty) + 6: (empty) + 7: R_ANGLE@14..15 ">" [] [] + 2: (empty) + 3: EOF@15..16 "" [Newline("\n")] [] + +``` diff --git a/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy1.html b/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy1.html new file mode 100644 index 000000000000..77a55353f20c --- /dev/null +++ b/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy1.html @@ -0,0 +1 @@ + diff --git a/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy1.html.snap b/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy1.html.snap new file mode 100644 index 000000000000..81461ab4d254 --- /dev/null +++ b/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy1.html.snap @@ -0,0 +1,50 @@ +--- +source: crates/biome_html_parser/tests/spec_test.rs +expression: snapshot +--- +## Input + +```html + + +``` + + +## AST + +``` +HtmlRoot { + bom_token: missing (optional), + directive: HtmlDirective { + l_angle_token: L_ANGLE@0..1 "<" [] [], + excl_token: BANG@1..2 "!" [] [], + doctype_token: DOCTYPE_KW@2..10 "DOCTYPE" [] [Whitespace(" ")], + html_token: HTML_KW@10..15 "html" [] [Whitespace(" ")], + quirk_token: HTML_LITERAL@15..22 "SYSTEM" [] [Whitespace(" ")], + public_id_token: HTML_STRING_LITERAL@22..43 "\"about:legacy-compat\"" [] [], + system_id_token: missing (optional), + r_angle_token: R_ANGLE@43..44 ">" [] [], + }, + html: missing (optional), + eof_token: EOF@44..45 "" [Newline("\n")] [], +} +``` + +## CST + +``` +0: HTML_ROOT@0..45 + 0: (empty) + 1: HTML_DIRECTIVE@0..44 + 0: L_ANGLE@0..1 "<" [] [] + 1: BANG@1..2 "!" 
[] [] + 2: DOCTYPE_KW@2..10 "DOCTYPE" [] [Whitespace(" ")] + 3: HTML_KW@10..15 "html" [] [Whitespace(" ")] + 4: HTML_LITERAL@15..22 "SYSTEM" [] [Whitespace(" ")] + 5: HTML_STRING_LITERAL@22..43 "\"about:legacy-compat\"" [] [] + 6: (empty) + 7: R_ANGLE@43..44 ">" [] [] + 2: (empty) + 3: EOF@44..45 "" [Newline("\n")] [] + +``` diff --git a/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy2.html b/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy2.html new file mode 100644 index 000000000000..fb3075d9e6f5 --- /dev/null +++ b/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy2.html @@ -0,0 +1 @@ + diff --git a/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy2.html.snap b/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy2.html.snap new file mode 100644 index 000000000000..92052f5e7cc2 --- /dev/null +++ b/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy2.html.snap @@ -0,0 +1,50 @@ +--- +source: crates/biome_html_parser/tests/spec_test.rs +expression: snapshot +--- +## Input + +```html + + +``` + + +## AST + +``` +HtmlRoot { + bom_token: missing (optional), + directive: HtmlDirective { + l_angle_token: L_ANGLE@0..1 "<" [] [], + excl_token: BANG@1..2 "!" [] [], + doctype_token: DOCTYPE_KW@2..10 "DOCTYPE" [] [Whitespace(" ")], + html_token: HTML_KW@10..15 "html" [] [Whitespace(" ")], + quirk_token: HTML_LITERAL@15..22 "PUBLIC" [] [Whitespace(" ")], + public_id_token: HTML_STRING_LITERAL@22..49 "\"-//W3C//DTD HTML 4.01//EN\"" [] [], + system_id_token: missing (optional), + r_angle_token: R_ANGLE@49..50 ">" [] [], + }, + html: missing (optional), + eof_token: EOF@50..51 "" [Newline("\n")] [], +} +``` + +## CST + +``` +0: HTML_ROOT@0..51 + 0: (empty) + 1: HTML_DIRECTIVE@0..50 + 0: L_ANGLE@0..1 "<" [] [] + 1: BANG@1..2 "!" 
[] [] + 2: DOCTYPE_KW@2..10 "DOCTYPE" [] [Whitespace(" ")] + 3: HTML_KW@10..15 "html" [] [Whitespace(" ")] + 4: HTML_LITERAL@15..22 "PUBLIC" [] [Whitespace(" ")] + 5: HTML_STRING_LITERAL@22..49 "\"-//W3C//DTD HTML 4.01//EN\"" [] [] + 6: (empty) + 7: R_ANGLE@49..50 ">" [] [] + 2: (empty) + 3: EOF@50..51 "" [Newline("\n")] [] + +``` diff --git a/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy3.html b/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy3.html new file mode 100644 index 000000000000..24d3759e2593 --- /dev/null +++ b/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy3.html @@ -0,0 +1,2 @@ + diff --git a/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy3.html.snap b/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy3.html.snap new file mode 100644 index 000000000000..0cad35915031 --- /dev/null +++ b/crates/biome_html_parser/tests/html_specs/ok/doctype/long-legacy3.html.snap @@ -0,0 +1,51 @@ +--- +source: crates/biome_html_parser/tests/spec_test.rs +expression: snapshot +--- +## Input + +```html + + +``` + + +## AST + +``` +HtmlRoot { + bom_token: missing (optional), + directive: HtmlDirective { + l_angle_token: L_ANGLE@0..1 "<" [] [], + excl_token: BANG@1..2 "!" [] [], + doctype_token: DOCTYPE_KW@2..10 "DOCTYPE" [] [Whitespace(" ")], + html_token: HTML_KW@10..15 "HTML" [] [Whitespace(" ")], + quirk_token: HTML_LITERAL@15..22 "PUBLIC" [] [Whitespace(" ")], + public_id_token: HTML_STRING_LITERAL@22..58 "\"-//W3C//DTD HTML 4.01 Frameset//EN\"" [] [], + system_id_token: HTML_STRING_LITERAL@58..102 "\"http://www.w3.org/TR/html4/frameset.dtd\"" [Newline("\n"), Whitespace(" ")] [], + r_angle_token: R_ANGLE@102..103 ">" [] [], + }, + html: missing (optional), + eof_token: EOF@103..104 "" [Newline("\n")] [], +} +``` + +## CST + +``` +0: HTML_ROOT@0..104 + 0: (empty) + 1: HTML_DIRECTIVE@0..103 + 0: L_ANGLE@0..1 "<" [] [] + 1: BANG@1..2 "!" 
[] [] + 2: DOCTYPE_KW@2..10 "DOCTYPE" [] [Whitespace(" ")] + 3: HTML_KW@10..15 "HTML" [] [Whitespace(" ")] + 4: HTML_LITERAL@15..22 "PUBLIC" [] [Whitespace(" ")] + 5: HTML_STRING_LITERAL@22..58 "\"-//W3C//DTD HTML 4.01 Frameset//EN\"" [] [] + 6: HTML_STRING_LITERAL@58..102 "\"http://www.w3.org/TR/html4/frameset.dtd\"" [Newline("\n"), Whitespace(" ")] [] + 7: R_ANGLE@102..103 ">" [] [] + 2: (empty) + 3: EOF@103..104 "" [Newline("\n")] [] + +``` diff --git a/crates/biome_html_parser/tests/html_specs/ok/doctype/minimal.html b/crates/biome_html_parser/tests/html_specs/ok/doctype/minimal.html new file mode 100644 index 000000000000..793f0b40f392 --- /dev/null +++ b/crates/biome_html_parser/tests/html_specs/ok/doctype/minimal.html @@ -0,0 +1 @@ + diff --git a/crates/biome_html_parser/tests/html_specs/ok/doctype/minimal.html.snap b/crates/biome_html_parser/tests/html_specs/ok/doctype/minimal.html.snap new file mode 100644 index 000000000000..ff82a3c32849 --- /dev/null +++ b/crates/biome_html_parser/tests/html_specs/ok/doctype/minimal.html.snap @@ -0,0 +1,50 @@ +--- +source: crates/biome_html_parser/tests/spec_test.rs +expression: snapshot +--- +## Input + +```html + + +``` + + +## AST + +``` +HtmlRoot { + bom_token: missing (optional), + directive: HtmlDirective { + l_angle_token: L_ANGLE@0..1 "<" [] [], + excl_token: BANG@1..2 "!" [] [], + doctype_token: DOCTYPE_KW@2..9 "DOCTYPE" [] [], + html_token: missing (optional), + quirk_token: missing (optional), + public_id_token: missing (optional), + system_id_token: missing (optional), + r_angle_token: R_ANGLE@9..10 ">" [] [], + }, + html: missing (optional), + eof_token: EOF@10..11 "" [Newline("\n")] [], +} +``` + +## CST + +``` +0: HTML_ROOT@0..11 + 0: (empty) + 1: HTML_DIRECTIVE@0..10 + 0: L_ANGLE@0..1 "<" [] [] + 1: BANG@1..2 "!" [] [] + 2: DOCTYPE_KW@2..9 "DOCTYPE" [] [] + 3: (empty) + 4: (empty) + 5: (empty) + 6: (empty) + 7: R_ANGLE@9..10 ">" [] [] + 2: (empty) + 3: EOF@10..11 "" [Newline("\n")] [] + +```