Skip to content

Commit

Permalink
feat(parser/html): parse doctype declarations
Browse files Browse the repository at this point in the history
  • Loading branch information
dyc3 committed Sep 18, 2024
1 parent a7b623a commit 17f704e
Show file tree
Hide file tree
Showing 14 changed files with 304 additions and 16 deletions.
39 changes: 25 additions & 14 deletions crates/biome_html_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,6 @@ pub(crate) struct HtmlLexer<'src> {
after_newline: bool,

unicode_bom_length: usize,

after_doctype: bool,
}

impl<'src> HtmlLexer<'src> {
Expand All @@ -49,24 +47,22 @@ impl<'src> HtmlLexer<'src> {
after_newline: false,
current_flags: TokenFlags::empty(),
unicode_bom_length: 0,
after_doctype: false,
}
}

/// Consume a token in the [HtmlLexContext::Regular] context.
fn consume_token(&mut self, current: u8) -> HtmlSyntaxKind {
match current {
b'\n' | b'\r' | b'\t' | b' ' => self.consume_newline_or_whitespaces(),
b'<' => self.consume_l_angle(),
b'>' => {
self.after_doctype = false;
self.consume_byte(T![>])
}
b'>' => self.consume_byte(T![>]),
b'/' => self.consume_byte(T![/]),
b'=' => self.consume_byte(T![=]),
b'!' => self.consume_byte(T![!]),
b'\'' | b'"' => self.consume_string_literal(current),
// TODO: differentiate between attribute names and identifiers
_ if is_identifier_byte(current) || is_attribute_name_byte(current) => {
self.consume_identifier(current)
self.consume_identifier(current, false)
}
_ => {
if self.position == 0 {
Expand All @@ -80,6 +76,7 @@ impl<'src> HtmlLexer<'src> {
}
}

/// Consume a token in the [HtmlLexContext::OutsideTag] context.
fn consume_token_outside_tag(&mut self, current: u8) -> HtmlSyntaxKind {
match current {
b'\n' | b'\r' | b'\t' | b' ' => self.consume_newline_or_whitespaces(),
Expand All @@ -88,6 +85,7 @@ impl<'src> HtmlLexer<'src> {
}
}

/// Consume a token in the [HtmlLexContext::AttributeValue] context.
fn consume_token_attribute_value(&mut self, current: u8) -> HtmlSyntaxKind {
match current {
b'\n' | b'\r' | b'\t' | b' ' => self.consume_newline_or_whitespaces(),
Expand All @@ -98,6 +96,21 @@ impl<'src> HtmlLexer<'src> {
}
}

/// Lex a single token while the lexer is in the [HtmlLexContext::Doctype] context.
///
/// `current` is the byte at the lexer's current position. It is dispatched as follows:
/// - newline, carriage return, tab or space: whitespace/newline trivia;
/// - `<`, `>` and `!`: the matching single-byte punctuation token;
/// - `'` or `"`: a string literal;
/// - an identifier or attribute-name byte: an identifier lexed with the doctype flag
///   set, which is what lets `html`/`HTML` come back as a keyword;
/// - anything else: handed to the unexpected-character path.
fn consume_token_doctype(&mut self, current: u8) -> HtmlSyntaxKind {
    if matches!(current, b'\n' | b'\r' | b'\t' | b' ') {
        return self.consume_newline_or_whitespaces();
    }

    // Single-byte punctuation maps directly onto a token kind.
    let punctuation = match current {
        b'<' => Some(T![<]),
        b'>' => Some(T![>]),
        b'!' => Some(T![!]),
        _ => None,
    };
    if let Some(kind) = punctuation {
        return self.consume_byte(kind);
    }

    if current == b'\'' || current == b'"' {
        return self.consume_string_literal(current);
    }

    if is_identifier_byte(current) || is_attribute_name_byte(current) {
        // `true` marks the doctype context for the identifier lexer.
        return self.consume_identifier(current, true);
    }

    self.consume_unexpected_character()
}

/// Bumps the current byte and creates a lexed token of the passed in kind.
#[inline]
fn consume_byte(&mut self, tok: HtmlSyntaxKind) -> HtmlSyntaxKind {
Expand Down Expand Up @@ -125,7 +138,7 @@ impl<'src> HtmlLexer<'src> {
debug_assert!(self.source.is_char_boundary(self.position));
}

fn consume_identifier(&mut self, first: u8) -> HtmlSyntaxKind {
fn consume_identifier(&mut self, first: u8, doctype_context: bool) -> HtmlSyntaxKind {
self.assert_current_char_boundary();

const BUFFER_SIZE: usize = 14;
Expand All @@ -149,11 +162,8 @@ impl<'src> HtmlLexer<'src> {
}

match &buffer[..len] {
b"doctype" | b"DOCTYPE" => {
self.after_doctype = true;
DOCTYPE_KW
}
b"html" | b"HTML" if self.after_doctype => HTML_KW,
b"doctype" | b"DOCTYPE" => DOCTYPE_KW,
b"html" | b"HTML" if doctype_context => HTML_KW,
_ => HTML_LITERAL,
}
}
Expand Down Expand Up @@ -431,6 +441,7 @@ impl<'src> Lexer<'src> for HtmlLexer<'src> {
HtmlLexContext::Regular => self.consume_token(current),
HtmlLexContext::OutsideTag => self.consume_token_outside_tag(current),
HtmlLexContext::AttributeValue => self.consume_token_attribute_value(current),
HtmlLexContext::Doctype => self.consume_token_doctype(current),
},
None => EOF,
}
Expand Down
4 changes: 4 additions & 0 deletions crates/biome_html_parser/src/lexer/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ macro_rules! assert_lex {
#[test]
fn doctype_key() {
assert_lex! {
HtmlLexContext::Doctype,
"doctype",
DOCTYPE_KW: 7,
}
Expand All @@ -113,6 +114,7 @@ fn doctype_key() {
#[test]
fn doctype_upper_key() {
assert_lex! {
HtmlLexContext::Doctype,
"DOCTYPE",
DOCTYPE_KW: 7,
}
Expand Down Expand Up @@ -164,6 +166,7 @@ fn html_text() {
#[test]
fn doctype_with_quirk() {
assert_lex! {
HtmlLexContext::Doctype,
"<!DOCTYPE HTML>",
L_ANGLE: 1,
BANG: 1,
Expand All @@ -177,6 +180,7 @@ fn doctype_with_quirk() {
#[test]
fn doctype_with_quirk_and_system() {
assert_lex! {
HtmlLexContext::Doctype,
"<!DOCTYPE HTML \"+//silmaril//dtd html pro v0r11 19970101//\">",
L_ANGLE: 1,
BANG: 1,
Expand Down
16 changes: 14 additions & 2 deletions crates/biome_html_parser/src/syntax/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,23 @@ fn parse_doc_type(p: &mut HtmlParser) -> ParsedSyntax {
p.bump(T![!]);

if p.at(T![doctype]) {
p.eat(T![doctype]);
p.eat_with_context(T![doctype], HtmlLexContext::Doctype);
}

if p.at(T![html]) {
p.eat(T![html]);
p.eat_with_context(T![html], HtmlLexContext::Doctype);
}

if p.at(HTML_LITERAL) {
p.eat_with_context(HTML_LITERAL, HtmlLexContext::Doctype);
}

if p.at(HTML_STRING_LITERAL) {
p.eat_with_context(HTML_STRING_LITERAL, HtmlLexContext::Doctype);
}

if p.at(HTML_STRING_LITERAL) {
p.eat_with_context(HTML_STRING_LITERAL, HtmlLexContext::Doctype);
}

p.eat(T![>]);
Expand Down
4 changes: 4 additions & 0 deletions crates/biome_html_parser/src/token_source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ pub(crate) enum HtmlLexContext {
///
/// This is because attribute values can start and end with a `"` or `'` character, or be unquoted, and the lexer needs to know to start lexing a string literal.
AttributeValue,
/// Enables the `html` keyword token.
///
/// When the parser encounters the sequence `<!DOCTYPE`, it switches to this context. It will remain in this context until the next `>` token is encountered.
Doctype,
}

impl LexContext for HtmlLexContext {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<!DOCTYPE html>
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
---
source: crates/biome_html_parser/tests/spec_test.rs
expression: snapshot
---
## Input

```html
<!DOCTYPE html>
```


## AST

```
HtmlRoot {
bom_token: missing (optional),
directive: HtmlDirective {
l_angle_token: L_ANGLE@0..1 "<" [] [],
excl_token: BANG@1..2 "!" [] [],
doctype_token: DOCTYPE_KW@2..10 "DOCTYPE" [] [Whitespace(" ")],
html_token: HTML_KW@10..14 "html" [] [],
quirk_token: missing (optional),
public_id_token: missing (optional),
system_id_token: missing (optional),
r_angle_token: R_ANGLE@14..15 ">" [] [],
},
html: missing (optional),
eof_token: EOF@15..16 "" [Newline("\n")] [],
}
```

## CST

```
0: [email protected]
0: (empty)
1: [email protected]
0: [email protected] "<" [] []
1: [email protected] "!" [] []
2: [email protected] "DOCTYPE" [] [Whitespace(" ")]
3: [email protected] "html" [] []
4: (empty)
5: (empty)
6: (empty)
7: [email protected] ">" [] []
2: (empty)
3: [email protected] "" [Newline("\n")] []
```
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<!DOCTYPE html SYSTEM "about:legacy-compat">
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
---
source: crates/biome_html_parser/tests/spec_test.rs
expression: snapshot
---
## Input

```html
<!DOCTYPE html SYSTEM "about:legacy-compat">
```


## AST

```
HtmlRoot {
bom_token: missing (optional),
directive: HtmlDirective {
l_angle_token: L_ANGLE@0..1 "<" [] [],
excl_token: BANG@1..2 "!" [] [],
doctype_token: DOCTYPE_KW@2..10 "DOCTYPE" [] [Whitespace(" ")],
html_token: HTML_KW@10..15 "html" [] [Whitespace(" ")],
quirk_token: HTML_LITERAL@15..22 "SYSTEM" [] [Whitespace(" ")],
public_id_token: HTML_STRING_LITERAL@22..43 "\"about:legacy-compat\"" [] [],
system_id_token: missing (optional),
r_angle_token: R_ANGLE@43..44 ">" [] [],
},
html: missing (optional),
eof_token: EOF@44..45 "" [Newline("\n")] [],
}
```

## CST

```
0: [email protected]
0: (empty)
1: [email protected]
0: [email protected] "<" [] []
1: [email protected] "!" [] []
2: [email protected] "DOCTYPE" [] [Whitespace(" ")]
3: [email protected] "html" [] [Whitespace(" ")]
4: [email protected] "SYSTEM" [] [Whitespace(" ")]
5: [email protected] "\"about:legacy-compat\"" [] []
6: (empty)
7: [email protected] ">" [] []
2: (empty)
3: [email protected] "" [Newline("\n")] []
```
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
---
source: crates/biome_html_parser/tests/spec_test.rs
expression: snapshot
---
## Input

```html
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
```


## AST

```
HtmlRoot {
bom_token: missing (optional),
directive: HtmlDirective {
l_angle_token: L_ANGLE@0..1 "<" [] [],
excl_token: BANG@1..2 "!" [] [],
doctype_token: DOCTYPE_KW@2..10 "DOCTYPE" [] [Whitespace(" ")],
html_token: HTML_KW@10..15 "html" [] [Whitespace(" ")],
quirk_token: HTML_LITERAL@15..22 "PUBLIC" [] [Whitespace(" ")],
public_id_token: HTML_STRING_LITERAL@22..49 "\"-//W3C//DTD HTML 4.01//EN\"" [] [],
system_id_token: missing (optional),
r_angle_token: R_ANGLE@49..50 ">" [] [],
},
html: missing (optional),
eof_token: EOF@50..51 "" [Newline("\n")] [],
}
```

## CST

```
0: [email protected]
0: (empty)
1: [email protected]
0: [email protected] "<" [] []
1: [email protected] "!" [] []
2: [email protected] "DOCTYPE" [] [Whitespace(" ")]
3: [email protected] "html" [] [Whitespace(" ")]
4: [email protected] "PUBLIC" [] [Whitespace(" ")]
5: [email protected] "\"-//W3C//DTD HTML 4.01//EN\"" [] []
6: (empty)
7: [email protected] ">" [] []
2: (empty)
3: [email protected] "" [Newline("\n")] []
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN"
"http://www.w3.org/TR/html4/frameset.dtd">
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
---
source: crates/biome_html_parser/tests/spec_test.rs
expression: snapshot
---
## Input

```html
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN"
"http://www.w3.org/TR/html4/frameset.dtd">
```


## AST

```
HtmlRoot {
bom_token: missing (optional),
directive: HtmlDirective {
l_angle_token: L_ANGLE@0..1 "<" [] [],
excl_token: BANG@1..2 "!" [] [],
doctype_token: DOCTYPE_KW@2..10 "DOCTYPE" [] [Whitespace(" ")],
html_token: HTML_KW@10..15 "HTML" [] [Whitespace(" ")],
quirk_token: HTML_LITERAL@15..22 "PUBLIC" [] [Whitespace(" ")],
public_id_token: HTML_STRING_LITERAL@22..58 "\"-//W3C//DTD HTML 4.01 Frameset//EN\"" [] [],
system_id_token: HTML_STRING_LITERAL@58..102 "\"http://www.w3.org/TR/html4/frameset.dtd\"" [Newline("\n"), Whitespace(" ")] [],
r_angle_token: R_ANGLE@102..103 ">" [] [],
},
html: missing (optional),
eof_token: EOF@103..104 "" [Newline("\n")] [],
}
```

## CST

```
0: [email protected]
0: (empty)
1: [email protected]
0: [email protected] "<" [] []
1: [email protected] "!" [] []
2: [email protected] "DOCTYPE" [] [Whitespace(" ")]
3: [email protected] "HTML" [] [Whitespace(" ")]
4: [email protected] "PUBLIC" [] [Whitespace(" ")]
5: [email protected] "\"-//W3C//DTD HTML 4.01 Frameset//EN\"" [] []
6: [email protected] "\"http://www.w3.org/TR/html4/frameset.dtd\"" [Newline("\n"), Whitespace(" ")] []
7: [email protected] ">" [] []
2: (empty)
3: [email protected] "" [Newline("\n")] []
```
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<!DOCTYPE>
Loading

0 comments on commit 17f704e

Please sign in to comment.