Skip to content

Commit 9bf6eb3

Browse files
committed
Added ParserData structure to tokenizer
This ParserData structure is a gateway between the tokenizer and the parser. In a certain case (just one), the tokenizer needs to know the state of the parser to generate a correct token. The current setup has the tokenizer and parser arranged in such a way that we cannot easily reference each other without borrow-check issues. Therefore we add this "hack", which finds out the data beforehand and calls the tokenizer with this data. This means the call is done for each tokenizer call, instead of only when needed, but it saves a big refactor of the tokenizer/parser. In the future, we should probably separate the tokenizer, parser, and tree builder/sink structure so this is not an issue anymore.
1 parent dca4907 commit 9bf6eb3

File tree

6 files changed

+58
-23
lines changed

6 files changed

+58
-23
lines changed

src/bin/parser-test.rs

+4-4
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ fn main() {
2929
tests_failed: Vec::new(),
3030
};
3131

32-
let filenames = Some(&["tests2.dat"][..]);
32+
let filenames = Some(&["plain-text-unsafe.dat"][..]);
3333
let fixtures = read_fixtures(filenames).expect("fixtures");
3434

3535
for fixture_file in fixtures {
@@ -41,9 +41,9 @@ fn main() {
4141

4242
let mut test_idx = 1;
4343
for test in fixture_file.tests {
44-
// if test_idx == 57 {
45-
run_test(test_idx, test, &mut results);
46-
// }
44+
if test_idx == 11 {
45+
run_test(test_idx, test, &mut results);
46+
}
4747
test_idx += 1;
4848
}
4949

src/html5/parser.rs

+16-2
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ use crate::html5::parser::document::{Document, DocumentBuilder, DocumentFragment
1717
use crate::html5::parser::quirks::QuirksMode;
1818
use crate::html5::tokenizer::state::State;
1919
use crate::html5::tokenizer::token::Token;
20-
use crate::html5::tokenizer::{Tokenizer, CHAR_NUL, CHAR_REPLACEMENT};
20+
use crate::html5::tokenizer::{ParserData, Tokenizer, CHAR_NUL, CHAR_REPLACEMENT};
2121
use crate::types::{ParseError, Result};
2222
use alloc::rc::Rc;
2323
use core::cell::RefCell;
@@ -3752,13 +3752,27 @@ impl<'chars> Html5Parser<'chars> {
37523752
}
37533753
}
37543754

3755+
fn parser_data(&self) -> ParserData {
3756+
let namespace = self
3757+
.get_adjusted_current_node()
3758+
.namespace
3759+
.unwrap_or_default();
3760+
3761+
ParserData {
3762+
adjusted_node_namespace: namespace,
3763+
}
3764+
}
3765+
37553766
/// Fetches the next token from the tokenizer. However, if the token is a text token AND
37563767
/// it starts with one or more whitespaces, the token is split into 2 tokens: the whitespace part
37573768
/// and the remainder.
37583769
fn fetch_next_token(&mut self) -> Token {
37593770
// If there are no tokens to fetch, fetch the next token from the tokenizer
37603771
if self.token_queue.is_empty() {
3761-
let token = self.tokenizer.next_token().expect("tokenizer error");
3772+
let token = self
3773+
.tokenizer
3774+
.next_token(self.parser_data())
3775+
.expect("tokenizer error");
37623776

37633777
if let Token::Text(value) = token {
37643778
for c in value.chars() {

src/html5/tokenizer.rs

+24-6
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use crate::bytes::Bytes::{self, *};
88
use crate::bytes::SeekMode::SeekCur;
99
use crate::bytes::{CharIterator, Position};
1010
use crate::html5::error_logger::{ErrorLogger, ParserError};
11+
use crate::html5::node::HTML_NAMESPACE;
1112
use crate::html5::tokenizer::state::State;
1213
use crate::html5::tokenizer::token::Token;
1314
use crate::types::{Error, Result};
@@ -50,6 +51,21 @@ pub struct Tokenizer<'stream> {
5051
pub error_logger: Rc<RefCell<ErrorLogger>>,
5152
}
5253

54+
/// This struct is a gateway between the parser and the tokenizer. It holds data that can be needed
55+
/// by the tokenizer in certain cases. See https://github.com/gosub-browser/gosub-engine/issues/230 for
56+
/// more information and how we should refactor this properly.
57+
pub struct ParserData {
58+
pub adjusted_node_namespace: String,
59+
}
60+
61+
impl Default for ParserData {
62+
fn default() -> Self {
63+
ParserData {
64+
adjusted_node_namespace: HTML_NAMESPACE.to_string(),
65+
}
66+
}
67+
}
68+
5369
/// Options that can be passed to the tokenizer. Mostly needed when dealing with tests.
5470
pub struct Options {
5571
/// Sets the initial state of the tokenizer. Normally only needed when dealing with tests
@@ -103,8 +119,8 @@ impl<'stream> Tokenizer<'stream> {
103119
}
104120

105121
/// Retrieves the next token from the input stream or Token::EOF when the end is reached
106-
pub fn next_token(&mut self) -> Result<Token> {
107-
self.consume_stream()?;
122+
pub fn next_token(&mut self, parser_data: ParserData) -> Result<Token> {
123+
self.consume_stream(parser_data)?;
108124

109125
if self.token_queue.is_empty() {
110126
return Ok(Token::Eof);
@@ -124,7 +140,7 @@ impl<'stream> Tokenizer<'stream> {
124140
}
125141

126142
/// Consumes the input stream. Continues until the stream is completed or a token has been generated.
127-
fn consume_stream(&mut self) -> Result<()> {
143+
fn consume_stream(&mut self, parser_data: ParserData) -> Result<()> {
128144
loop {
129145
// Something is already in the token buffer, so we can return it.
130146
if !self.token_queue.is_empty() {
@@ -1210,9 +1226,11 @@ impl<'stream> Tokenizer<'stream> {
12101226
if self.chars.look_ahead_slice(7) == "[CDATA[" {
12111227
self.chars.seek(SeekCur, 7);
12121228

1213-
// @TODO: If there is an adjusted current node and it is not an element in the HTML namespace,
1214-
// then switch to the CDATA section state. Otherwise, this is a cdata-in-html-content parse error.
1215-
// Create a comment token whose data is the "[CDATA[" string. Switch to the bogus comment state.
1229+
if parser_data.adjusted_node_namespace != HTML_NAMESPACE {
1230+
self.state = State::CDATASection;
1231+
continue;
1232+
}
1233+
12161234
self.parse_error(ParserError::CdataInHtmlContent);
12171235
self.current_token = Some(Token::Comment("[CDATA[".into()));
12181236

src/html5/tokenizer/character_reference.rs

+3-1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ use crate::bytes::{
66
};
77
use crate::html5::error_logger::ParserError;
88
use crate::html5::tokenizer::replacement_tables::{TOKEN_NAMED_CHARS, TOKEN_REPLACEMENTS};
9+
910
use crate::html5::tokenizer::{Tokenizer, CHAR_REPLACEMENT};
1011
use lazy_static::lazy_static;
1112

@@ -350,6 +351,7 @@ lazy_static! {
350351
#[cfg(test)]
351352
mod tests {
352353
use super::*;
354+
use crate::html5::tokenizer::ParserData;
353355
use crate::{bytes::CharIterator, html5::error_logger::ErrorLogger};
354356
use std::cell::RefCell;
355357
use std::rc::Rc;
@@ -367,7 +369,7 @@ mod tests {
367369
let error_logger = Rc::new(RefCell::new(ErrorLogger::new()));
368370
let mut tokenizer = Tokenizer::new(&mut chars, None, error_logger.clone());
369371

370-
let token = tokenizer.next_token().unwrap();
372+
let token = tokenizer.next_token(ParserData::default()).unwrap();
371373
assert_eq!(expected, token.to_string());
372374
}
373375
)*

src/testing/tokenizer.rs

+4-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use super::FIXTURE_ROOT;
22
use crate::bytes::CharIterator;
3+
use crate::html5::tokenizer::ParserData;
34
use crate::html5::{
45
error_logger::ErrorLogger,
56
tokenizer::{
@@ -211,12 +212,12 @@ impl TestSpec {
211212
// If there is no output, still do an (initial) next token so the parser can generate
212213
// errors.
213214
if self.output.is_empty() {
214-
tokenizer.next_token().unwrap();
215+
tokenizer.next_token(ParserData::default()).unwrap();
215216
}
216217

217218
// There can be multiple tokens to match. Make sure we match all of them
218219
for expected in self.output.iter() {
219-
let actual = tokenizer.next_token().unwrap();
220+
let actual = tokenizer.next_token(ParserData::default()).unwrap();
220221
assert_eq!(self.escape(&actual), self.escape(expected));
221222
}
222223

@@ -237,7 +238,7 @@ impl TestSpec {
237238
let mut tokenizer = builder.build();
238239

239240
for _ in self.output.iter() {
240-
tokenizer.next_token().unwrap();
241+
tokenizer.next_token(ParserData::default()).unwrap();
241242
}
242243
}
243244
}

tests/tree_construction.rs

+7-7
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ const DISABLED_CASES: &[&str] = &[
1515
#[test_case("tests3.dat")]
1616
#[test_case("tests4.dat")]
1717
#[test_case("tests5.dat")]
18-
// #[test_case("tests6.dat")]
18+
#[test_case("tests6.dat")]
1919
#[test_case("tests7.dat")]
2020
#[test_case("tests8.dat")]
2121
#[test_case("tests9.dat")]
@@ -27,9 +27,9 @@ const DISABLED_CASES: &[&str] = &[
2727
// #[test_case("tests16.dat")]
2828
#[test_case("tests17.dat")]
2929
#[test_case("tests18.dat")]
30-
// #[test_case("tests19.dat")]
30+
#[test_case("tests19.dat")]
3131
#[test_case("tests20.dat")]
32-
// #[test_case("tests21.dat")]
32+
#[test_case("tests21.dat")]
3333
#[test_case("tests22.dat")]
3434
#[test_case("tests23.dat")]
3535
#[test_case("tests24.dat")]
@@ -40,11 +40,11 @@ const DISABLED_CASES: &[&str] = &[
4040
#[test_case("blocks.dat")]
4141
#[test_case("comments01.dat")]
4242
#[test_case("doctype01.dat")]
43-
// #[test_case("domjs-unsafe.dat")]
43+
#[test_case("domjs-unsafe.dat")]
4444
#[test_case("entities01.dat")]
4545
#[test_case("entities02.dat")]
4646
#[test_case("foreign-fragment.dat")]
47-
#[test_case("html5test-com.dat")]
47+
// #[test_case("html5test-com.dat")]
4848
#[test_case("inbody01.dat")]
4949
#[test_case("isindex.dat")]
5050
#[test_case("main-element.dat")]
@@ -54,7 +54,7 @@ const DISABLED_CASES: &[&str] = &[
5454
#[test_case("noscript01.dat")]
5555
#[test_case("pending-spec-changes.dat")]
5656
#[test_case("pending-spec-changes-plain-text-unsafe.dat")]
57-
// #[test_case("plain-text-unsafe.dat")]
57+
#[test_case("plain-text-unsafe.dat")]
5858
#[test_case("quirks01.dat")]
5959
#[test_case("ruby.dat")]
6060
#[test_case("scriptdata01.dat")]
@@ -64,7 +64,7 @@ const DISABLED_CASES: &[&str] = &[
6464
// #[test_case("template.dat")]
6565
#[test_case("tests_innerHTML_1.dat")]
6666
#[test_case("tricky01.dat")]
67-
// #[test_case("webkit01.dat")]
67+
#[test_case("webkit01.dat")]
6868
#[test_case("webkit02.dat")]
6969
fn tree_construction(filename: &str) {
7070
let fixture_file =

0 commit comments

Comments
 (0)