Skip to content

Commit 9bf6eb3

Browse files
committed
Added ParserData structure to tokenizer
This ParserData structure is a gateway between the tokenizer and the parser. In a certain case (just one), the tokenizer needs to know the state of the parser to generate a correct token. The current setup has the tokenizer and parser arranged in such a way that we cannot easily reference each other without borrow-check issues. Therefore we add this "hack", which finds out the data beforehand and calls the tokenizer with this data. This means the call is done for each tokenizer call, instead of only when needed, but it saves a big refactor of the tokenizer/parser. In the future, we should probably separate the tokenizer, parser, and tree builder/sink structure so this is not an issue anymore.
1 parent dca4907 commit 9bf6eb3

File tree

6 files changed

+58
-23
lines changed

6 files changed

+58
-23
lines changed

src/bin/parser-test.rs

+4-4
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ fn main() {
2929
tests_failed: Vec::new(),
3030
};
3131

32-
let filenames = Some(&["tests2.dat"][..]);
32+
let filenames = Some(&["plain-text-unsafe.dat"][..]);
3333
let fixtures = read_fixtures(filenames).expect("fixtures");
3434

3535
for fixture_file in fixtures {
@@ -41,9 +41,9 @@ fn main() {
4141

4242
let mut test_idx = 1;
4343
for test in fixture_file.tests {
44-
// if test_idx == 57 {
45-
run_test(test_idx, test, &mut results);
46-
// }
44+
if test_idx == 11 {
45+
run_test(test_idx, test, &mut results);
46+
}
4747
test_idx += 1;
4848
}
4949

src/html5/parser.rs

+16-2
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ use crate::html5::parser::document::{Document, DocumentBuilder, DocumentFragment
1717
use crate::html5::parser::quirks::QuirksMode;
1818
use crate::html5::tokenizer::state::State;
1919
use crate::html5::tokenizer::token::Token;
20-
use crate::html5::tokenizer::{Tokenizer, CHAR_NUL, CHAR_REPLACEMENT};
20+
use crate::html5::tokenizer::{ParserData, Tokenizer, CHAR_NUL, CHAR_REPLACEMENT};
2121
use crate::types::{ParseError, Result};
2222
use alloc::rc::Rc;
2323
use core::cell::RefCell;
@@ -3752,13 +3752,27 @@ impl<'chars> Html5Parser<'chars> {
37523752
}
37533753
}
37543754

3755+
fn parser_data(&self) -> ParserData {
3756+
let namespace = self
3757+
.get_adjusted_current_node()
3758+
.namespace
3759+
.unwrap_or_default();
3760+
3761+
ParserData {
3762+
adjusted_node_namespace: namespace,
3763+
}
3764+
}
3765+
37553766
/// Fetches the next token from the tokenizer. However, if the token is a text token AND
37563767
/// it starts with one or more whitespaces, the token is split into 2 tokens: the whitespace part
37573768
/// and the remainder.
37583769
fn fetch_next_token(&mut self) -> Token {
37593770
// If there are no tokens to fetch, fetch the next token from the tokenizer
37603771
if self.token_queue.is_empty() {
3761-
let token = self.tokenizer.next_token().expect("tokenizer error");
3772+
let token = self
3773+
.tokenizer
3774+
.next_token(self.parser_data())
3775+
.expect("tokenizer error");
37623776

37633777
if let Token::Text(value) = token {
37643778
for c in value.chars() {

src/html5/tokenizer.rs

+24-6
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use crate::bytes::Bytes::{self, *};
88
use crate::bytes::SeekMode::SeekCur;
99
use crate::bytes::{CharIterator, Position};
1010
use crate::html5::error_logger::{ErrorLogger, ParserError};
11+
use crate::html5::node::HTML_NAMESPACE;
1112
use crate::html5::tokenizer::state::State;
1213
use crate::html5::tokenizer::token::Token;
1314
use crate::types::{Error, Result};
@@ -50,6 +51,21 @@ pub struct Tokenizer<'stream> {
5051
pub error_logger: Rc<RefCell<ErrorLogger>>,
5152
}
5253

54+
/// This struct is a gateway between the parser and the tokenizer. It holds data that can be needed
55+
/// by the tokenizer in certain cases. See https://github.com/gosub-browser/gosub-engine/issues/230 for
56+
/// more information and how we should refactor this properly.
57+
pub struct ParserData {
58+
pub adjusted_node_namespace: String,
59+
}
60+
61+
impl Default for ParserData {
62+
fn default() -> Self {
63+
ParserData {
64+
adjusted_node_namespace: HTML_NAMESPACE.to_string(),
65+
}
66+
}
67+
}
68+
5369
/// Options that can be passed to the tokenizer. Mostly needed when dealing with tests.
5470
pub struct Options {
5571
/// Sets the initial state of the tokenizer. Normally only needed when dealing with tests
@@ -103,8 +119,8 @@ impl<'stream> Tokenizer<'stream> {
103119
}
104120

105121
/// Retrieves the next token from the input stream or Token::EOF when the end is reached
106-
pub fn next_token(&mut self) -> Result<Token> {
107-
self.consume_stream()?;
122+
pub fn next_token(&mut self, parser_data: ParserData) -> Result<Token> {
123+
self.consume_stream(parser_data)?;
108124

109125
if self.token_queue.is_empty() {
110126
return Ok(Token::Eof);
@@ -124,7 +140,7 @@ impl<'stream> Tokenizer<'stream> {
124140
}
125141

126142
/// Consumes the input stream. Continues until the stream is completed or a token has been generated.
127-
fn consume_stream(&mut self) -> Result<()> {
143+
fn consume_stream(&mut self, parser_data: ParserData) -> Result<()> {
128144
loop {
129145
// Something is already in the token buffer, so we can return it.
130146
if !self.token_queue.is_empty() {
@@ -1210,9 +1226,11 @@ impl<'stream> Tokenizer<'stream> {
12101226
if self.chars.look_ahead_slice(7) == "[CDATA[" {
12111227
self.chars.seek(SeekCur, 7);
12121228

1213-
// @TODO: If there is an adjusted current node and it is not an element in the HTML namespace,
1214-
// then switch to the CDATA section state. Otherwise, this is a cdata-in-html-content parse error.
1215-
// Create a comment token whose data is the "[CDATA[" string. Switch to the bogus comment state.
1229+
if parser_data.adjusted_node_namespace != HTML_NAMESPACE {
1230+
self.state = State::CDATASection;
1231+
continue;
1232+
}
1233+
12161234
self.parse_error(ParserError::CdataInHtmlContent);
12171235
self.current_token = Some(Token::Comment("[CDATA[".into()));
12181236

src/html5/tokenizer/character_reference.rs

+3-1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ use crate::bytes::{
66
};
77
use crate::html5::error_logger::ParserError;
88
use crate::html5::tokenizer::replacement_tables::{TOKEN_NAMED_CHARS, TOKEN_REPLACEMENTS};
9+
910
use crate::html5::tokenizer::{Tokenizer, CHAR_REPLACEMENT};
1011
use lazy_static::lazy_static;
1112

@@ -350,6 +351,7 @@ lazy_static! {
350351
#[cfg(test)]
351352
mod tests {
352353
use super::*;
354+
use crate::html5::tokenizer::ParserData;
353355
use crate::{bytes::CharIterator, html5::error_logger::ErrorLogger};
354356
use std::cell::RefCell;
355357
use std::rc::Rc;
@@ -367,7 +369,7 @@ mod tests {
367369
let error_logger = Rc::new(RefCell::new(ErrorLogger::new()));
368370
let mut tokenizer = Tokenizer::new(&mut chars, None, error_logger.clone());
369371

370-
let token = tokenizer.next_token().unwrap();
372+
let token = tokenizer.next_token(ParserData::default()).unwrap();
371373
assert_eq!(expected, token.to_string());
372374
}
373375
)*

src/testing/tokenizer.rs

+4-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use super::FIXTURE_ROOT;
22
use crate::bytes::CharIterator;
3+
use crate::html5::tokenizer::ParserData;
34
use crate::html5::{
45
error_logger::ErrorLogger,
56
tokenizer::{
@@ -211,12 +212,12 @@ impl TestSpec {
211212
// If there is no output, still do an (initial) next token so the parser can generate
212213
// errors.
213214
if self.output.is_empty() {
214-
tokenizer.next_token().unwrap();
215+
tokenizer.next_token(ParserData::default()).unwrap();
215216
}
216217

217218
// There can be multiple tokens to match. Make sure we match all of them
218219
for expected in self.output.iter() {
219-
let actual = tokenizer.next_token().unwrap();
220+
let actual = tokenizer.next_token(ParserData::default()).unwrap();
220221
assert_eq!(self.escape(&actual), self.escape(expected));
221222
}
222223

@@ -237,7 +238,7 @@ impl TestSpec {
237238
let mut tokenizer = builder.build();
238239

239240
for _ in self.output.iter() {
240-
tokenizer.next_token().unwrap();
241+
tokenizer.next_token(ParserData::default()).unwrap();
241242
}
242243
}
243244
}

tests/tree_construction.rs

+7-7
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ const DISABLED_CASES: &[&str] = &[
1515
#[test_case("tests3.dat")]
1616
#[test_case("tests4.dat")]
1717
#[test_case("tests5.dat")]
18-
// #[test_case("tests6.dat")]
18+
#[test_case("tests6.dat")]
1919
#[test_case("tests7.dat")]
2020
#[test_case("tests8.dat")]
2121
#[test_case("tests9.dat")]
@@ -27,9 +27,9 @@ const DISABLED_CASES: &[&str] = &[
2727
// #[test_case("tests16.dat")]
2828
#[test_case("tests17.dat")]
2929
#[test_case("tests18.dat")]
30-
// #[test_case("tests19.dat")]
30+
#[test_case("tests19.dat")]
3131
#[test_case("tests20.dat")]
32-
// #[test_case("tests21.dat")]
32+
#[test_case("tests21.dat")]
3333
#[test_case("tests22.dat")]
3434
#[test_case("tests23.dat")]
3535
#[test_case("tests24.dat")]
@@ -40,11 +40,11 @@ const DISABLED_CASES: &[&str] = &[
4040
#[test_case("blocks.dat")]
4141
#[test_case("comments01.dat")]
4242
#[test_case("doctype01.dat")]
43-
// #[test_case("domjs-unsafe.dat")]
43+
#[test_case("domjs-unsafe.dat")]
4444
#[test_case("entities01.dat")]
4545
#[test_case("entities02.dat")]
4646
#[test_case("foreign-fragment.dat")]
47-
#[test_case("html5test-com.dat")]
47+
// #[test_case("html5test-com.dat")]
4848
#[test_case("inbody01.dat")]
4949
#[test_case("isindex.dat")]
5050
#[test_case("main-element.dat")]
@@ -54,7 +54,7 @@ const DISABLED_CASES: &[&str] = &[
5454
#[test_case("noscript01.dat")]
5555
#[test_case("pending-spec-changes.dat")]
5656
#[test_case("pending-spec-changes-plain-text-unsafe.dat")]
57-
// #[test_case("plain-text-unsafe.dat")]
57+
#[test_case("plain-text-unsafe.dat")]
5858
#[test_case("quirks01.dat")]
5959
#[test_case("ruby.dat")]
6060
#[test_case("scriptdata01.dat")]
@@ -64,7 +64,7 @@ const DISABLED_CASES: &[&str] = &[
6464
// #[test_case("template.dat")]
6565
#[test_case("tests_innerHTML_1.dat")]
6666
#[test_case("tricky01.dat")]
67-
// #[test_case("webkit01.dat")]
67+
#[test_case("webkit01.dat")]
6868
#[test_case("webkit02.dat")]
6969
fn tree_construction(filename: &str) {
7070
let fixture_file =

0 commit comments

Comments
 (0)