Skip to content

Commit

Permalink
fix(parser/html): correctly lex tag names and attribute names
Browse files Browse the repository at this point in the history
  • Loading branch information
dyc3 committed Sep 10, 2024
1 parent 7ffc53f commit 663a823
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 2 deletions.
18 changes: 16 additions & 2 deletions crates/biome_html_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,10 @@ impl<'src> HtmlLexer<'src> {
b'!' => self.consume_byte(T![!]),
b'=' => self.consume_byte(T![=]),
b'\'' | b'"' => self.consume_string_literal(current),
_ if is_identifier_byte(current) => self.consume_identifier(current),
// TODO: differentiate between attribute names and identifiers
_ if is_identifier_byte(current) || is_attribute_name_byte(current) => {
self.consume_identifier(current)
}
_ => {
if self.position == 0 {
if let Some((bom, bom_size)) = self.consume_potential_bom(UNICODE_BOM) {
Expand Down Expand Up @@ -141,7 +144,7 @@ impl<'src> HtmlLexer<'src> {
self.advance_byte_or_char(first);

while let Some(byte) = self.current_byte() {
if is_identifier_byte(byte) {
if is_identifier_byte(byte) || is_attribute_name_byte(byte) {
if len < BUFFER_SIZE {
buffer[len] = byte;
len += 1;
Expand Down Expand Up @@ -434,9 +437,20 @@ impl<'src> Lexer<'src> for HtmlLexer<'src> {
}

/// Returns `true` when `byte` is an ASCII letter (`A-Z`, `a-z`) or an ASCII
/// digit (`0-9`).
///
/// Every other byte, including `-`, `_` and all non-ASCII bytes, is rejected.
fn is_identifier_byte(byte: u8) -> bool {
    // Character set for tag names, see https://html.spec.whatwg.org/#elements-2
    byte.is_ascii_alphabetic() || byte.is_ascii_digit()
}

/// Returns `true` when `byte` may appear in an attribute name.
///
/// Accepted bytes are the visible ASCII characters (`0x21..=0x7E`) except the
/// delimiters `"`, `'`, `>`, `<`, `/` and `=`. Whitespace, ASCII control
/// characters and non-ASCII bytes are all rejected.
fn is_attribute_name_byte(byte: u8) -> bool {
    // Character set for attribute names, see https://html.spec.whatwg.org/#attributes-2
    //
    // `is_ascii_graphic` already excludes space, tab, newline, every other
    // control character and anything outside ASCII, so only the delimiter
    // bytes need to be filtered out explicitly.
    let is_delimiter = matches!(byte, b'"' | b'\'' | b'>' | b'<' | b'/' | b'=');
    byte.is_ascii_graphic() && !is_delimiter
}

#[derive(Copy, Clone, Debug)]
enum LexStringState {
/// String that contains an invalid escape sequence
Expand Down
27 changes: 27 additions & 0 deletions crates/biome_html_parser/src/lexer/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,21 @@ fn element() {
}
}

// Lexes a `<div>` element whose text content mixes letters with the
// punctuation `!@_-:;`, and expects that whole text run to come out as a
// single HTML_LITERAL rather than being split at the punctuation.
#[test]
fn element_with_text() {
assert_lex! {
"<div>abcdefghijklmnopqrstuvwxyz!@_-:;</div>",
// `<`
L_ANGLE: 1,
// `div`
HTML_LITERAL: 3,
// `>`
R_ANGLE: 1,
// 26 letters followed by the 6 punctuation characters `!@_-:;`
HTML_LITERAL: 32,
// `<`
L_ANGLE: 1,
// `/`
SLASH: 1,
// `div`
HTML_LITERAL: 3,
// `>`
R_ANGLE: 1,
}
}

#[test]
fn doctype_with_quirk() {
assert_lex! {
Expand Down Expand Up @@ -191,6 +206,18 @@ fn element_with_attributes() {
}
}

// Lexes an opening tag with a hyphenated attribute name and expects
// `aria-hidden` to come out as one HTML_LITERAL instead of being split at
// the `-`.
#[test]
fn element_with_dashed_attributes() {
assert_lex! {
"<div aria-hidden>",
// `<`
L_ANGLE: 1,
// `div`
HTML_LITERAL: 3,
// the single space between the tag name and the attribute
WHITESPACE: 1,
// `aria-hidden` (11 characters including the dash)
HTML_LITERAL: 11,
// `>`
R_ANGLE: 1,
}
}

#[test]
fn html_element() {
assert_lex! {
Expand Down

0 comments on commit 663a823

Please sign in to comment.