diff --git a/.changeset/fix-html-bom-handling.md b/.changeset/fix-html-bom-handling.md new file mode 100644 index 000000000000..1f4f2cb105bc --- /dev/null +++ b/.changeset/fix-html-bom-handling.md @@ -0,0 +1,5 @@ +--- +"@biomejs/biome": patch +--- + +Fixed [#7919](https://github.com/biomejs/biome/issues/7919): The HTML parser now correctly handles Unicode BOM (Byte Order Mark) characters at the beginning of HTML files, ensuring proper parsing and tokenization. diff --git a/crates/biome_html_parser/src/lexer/mod.rs b/crates/biome_html_parser/src/lexer/mod.rs index 1a290af641fa..a346afb2e169 100644 --- a/crates/biome_html_parser/src/lexer/mod.rs +++ b/crates/biome_html_parser/src/lexer/mod.rs @@ -82,15 +82,7 @@ impl<'src> HtmlLexer<'src> { _ if self.current_kind != T![<] && is_attribute_name_byte(current) => { self.consume_identifier(current, false) } - _ => { - if self.position == 0 - && let Some((bom, bom_size)) = self.consume_potential_bom(UNICODE_BOM) - { - self.unicode_bom_length = bom_size; - return bom; - } - self.consume_unexpected_character() - } + _ => self.consume_unexpected_character(), } } @@ -134,7 +126,15 @@ impl<'src> HtmlLexer<'src> { self.consume_byte(HTML_LITERAL) } } - _ => self.consume_html_text(current), + _ => { + if self.position == 0 + && let Some((bom, bom_size)) = self.consume_potential_bom(UNICODE_BOM) + { + self.unicode_bom_length = bom_size; + return bom; + } + self.consume_html_text(current) + } } } diff --git a/crates/biome_html_parser/tests/html_specs/ok/bom.html b/crates/biome_html_parser/tests/html_specs/ok/bom.html new file mode 100644 index 000000000000..8ab3751e09f0 --- /dev/null +++ b/crates/biome_html_parser/tests/html_specs/ok/bom.html @@ -0,0 +1 @@ + diff --git a/crates/biome_html_parser/tests/html_specs/ok/bom.html.snap b/crates/biome_html_parser/tests/html_specs/ok/bom.html.snap new file mode 100644 index 000000000000..92a48c087871 --- /dev/null +++ b/crates/biome_html_parser/tests/html_specs/ok/bom.html.snap @@ -0,0 +1,53 @@ +--- +source: crates/biome_html_parser/tests/spec_test.rs +assertion_line: 138 +expression: snapshot +--- +## Input + +```html + + +``` + + +## AST + +``` +HtmlRoot { + bom_token: UNICODE_BOM@0..3 "\u{feff}" [] [], + frontmatter: missing (optional), + directive: HtmlDirective { + l_angle_token: L_ANGLE@3..4 "<" [] [], + excl_token: BANG@4..5 "!" [] [], + doctype_token: DOCTYPE_KW@5..12 "doctype" [] [], + html_token: missing (optional), + quirk_token: missing (optional), + public_id_token: missing (optional), + system_id_token: missing (optional), + r_angle_token: R_ANGLE@12..13 ">" [] [], + }, + html: HtmlElementList [], + eof_token: EOF@13..14 "" [Newline("\n")] [], +} +``` + +## CST + +``` +0: HTML_ROOT@0..14 + 0: UNICODE_BOM@0..3 "\u{feff}" [] [] + 1: (empty) + 2: HTML_DIRECTIVE@3..13 + 0: L_ANGLE@3..4 "<" [] [] + 1: BANG@4..5 "!" [] [] + 2: DOCTYPE_KW@5..12 "doctype" [] [] + 3: (empty) + 4: (empty) + 5: (empty) + 6: (empty) + 7: R_ANGLE@12..13 ">" [] [] + 3: HTML_ELEMENT_LIST@13..13 + 4: EOF@13..14 "" [Newline("\n")] [] + +```