diff --git a/crates/oxc_linter/src/rules/eslint/no_control_regex.rs b/crates/oxc_linter/src/rules/eslint/no_control_regex.rs index 007103bbf25b3..4b4ae039b0891 100644 --- a/crates/oxc_linter/src/rules/eslint/no_control_regex.rs +++ b/crates/oxc_linter/src/rules/eslint/no_control_regex.rs @@ -317,7 +317,9 @@ mod tests { r"/\u{1F}/", r"/\u{1F}/g", r"new RegExp('\\u{20}', 'u')", + r"new RegExp('\\u{20}', `u`)", r"new RegExp('\\u{1F}')", + r"new RegExp(`\\u{1F}`)", r"new RegExp('\\u{1F}', 'g')", r"new RegExp('\\u{1F}', flags)", // unknown flags, we assume no 'u' // https://github.com/oxc-project/oxc/issues/6136 @@ -347,6 +349,8 @@ mod tests { r"/\u{1F}/u", r"/\u{1F}/ugi", r"new RegExp('\\u{1F}', 'u')", + r"new RegExp(`\\u{1F}`, 'u')", + r"new RegExp('\\u{1F}', `u`)", r"new RegExp('\\u{1F}', 'ugi')", // https://github.com/oxc-project/oxc/issues/6136 r"/\u{0a}/u", diff --git a/crates/oxc_linter/src/rules/eslint/no_useless_backreference.rs b/crates/oxc_linter/src/rules/eslint/no_useless_backreference.rs index 60fb64f27723d..4bc618cb51bcc 100644 --- a/crates/oxc_linter/src/rules/eslint/no_useless_backreference.rs +++ b/crates/oxc_linter/src/rules/eslint/no_useless_backreference.rs @@ -365,6 +365,7 @@ fn test() { r"new RegExp(`${prefix}\\1(a)`)", r"let RegExp; new RegExp('\\1(a)');", r"function foo() { var RegExp; RegExp('\\1(a)', 'u'); }", + r"function foo() { var RegExp; RegExp('\\1(a)', `u`); }", r"function foo(RegExp) { new RegExp('\\1(a)'); }", r"if (foo) { const RegExp = bar; RegExp('\\1(a)'); }", // we don't support globals off yet diff --git a/crates/oxc_linter/src/snapshots/eslint_no_control_regex.snap b/crates/oxc_linter/src/snapshots/eslint_no_control_regex.snap index b2a6d09412279..a91e9d0b7e91b 100644 Binary files a/crates/oxc_linter/src/snapshots/eslint_no_control_regex.snap and b/crates/oxc_linter/src/snapshots/eslint_no_control_regex.snap differ diff --git a/crates/oxc_linter/src/utils/regex.rs b/crates/oxc_linter/src/utils/regex.rs index 
baaf6fbae81c1..dfc377d4277ce 100644 --- a/crates/oxc_linter/src/utils/regex.rs +++ b/crates/oxc_linter/src/utils/regex.rs @@ -39,19 +39,55 @@ where let arg2 = arg2.and_then(Argument::as_expression).map(Expression::get_inner_expression); // note: improvements required for strings used via identifier references // Missing or non-string arguments will be runtime errors, but are not covered by this rule. - match (&arg1, &arg2) { + match (arg1, arg2) { (Some(Expression::StringLiteral(pattern)), Some(Expression::StringLiteral(flags))) => { let allocator = Allocator::default(); if let Some(pat) = parse_regex(&allocator, pattern.span, Some(flags.span), ctx) { cb(&pat, pattern.span); } } + (Some(Expression::StringLiteral(pattern)), Some(Expression::TemplateLiteral(flags))) => { + if !flags.is_no_substitution_template() { + return; + } + let allocator = Allocator::default(); + if let Some(pat) = parse_regex(&allocator, pattern.span, Some(flags.span), ctx) { + cb(&pat, pattern.span); + } + } (Some(Expression::StringLiteral(pattern)), _) => { let allocator = Allocator::default(); if let Some(pat) = parse_regex(&allocator, pattern.span, None, ctx) { cb(&pat, pattern.span); } } + (Some(Expression::TemplateLiteral(pattern)), Some(Expression::TemplateLiteral(flags))) => { + if !pattern.is_no_substitution_template() || !flags.is_no_substitution_template() { + return; + } + let allocator = Allocator::default(); + if let Some(pat) = parse_regex(&allocator, pattern.span, Some(flags.span), ctx) { + cb(&pat, pattern.span); + } + } + (Some(Expression::TemplateLiteral(pattern)), Some(Expression::StringLiteral(flags))) => { + if !pattern.is_no_substitution_template() { + return; + } + let allocator = Allocator::default(); + if let Some(pat) = parse_regex(&allocator, pattern.span, Some(flags.span), ctx) { + cb(&pat, pattern.span); + } + } + (Some(Expression::TemplateLiteral(pattern)), _) => { + if !pattern.is_no_substitution_template() { + return; + } + let allocator = Allocator::default(); 
+ if let Some(pat) = parse_regex(&allocator, pattern.span, None, ctx) { + cb(&pat, pattern.span); + } + } _ => {} } } diff --git a/crates/oxc_regular_expression/src/parser/parser_impl.rs b/crates/oxc_regular_expression/src/parser/parser_impl.rs index e77c323f888ef..b21abdbbb54d0 100644 --- a/crates/oxc_regular_expression/src/parser/parser_impl.rs +++ b/crates/oxc_regular_expression/src/parser/parser_impl.rs @@ -93,7 +93,7 @@ impl<'a> ConstructorParser<'a> { (false, false) }; - let pattern_text = if matches!(self.pattern_text, r#""""# | "''") { + let pattern_text = if matches!(self.pattern_text, r#""""# | "''" | "``") { r#""(?:)""# } else { self.pattern_text diff --git a/crates/oxc_regular_expression/src/parser/reader/ast.rs b/crates/oxc_regular_expression/src/parser/reader/ast.rs new file mode 100644 index 0000000000000..7e1a6d7825c6b --- /dev/null +++ b/crates/oxc_regular_expression/src/parser/reader/ast.rs @@ -0,0 +1,10 @@ +use oxc_span::Span; + +/// Represents UTF-16 code unit(u16 as u32) or Unicode code point(char as u32). +/// `Span` width may be more than 1, since there will be escape sequences. +#[derive(Debug, Clone, Copy)] +pub struct CodePoint { + pub span: Span, + // NOTE: If we need codegen, more information should be added. 
+ pub value: u32, +} diff --git a/crates/oxc_regular_expression/src/parser/reader/string_literal_parser/characters.rs b/crates/oxc_regular_expression/src/parser/reader/characters.rs similarity index 100% rename from crates/oxc_regular_expression/src/parser/reader/string_literal_parser/characters.rs rename to crates/oxc_regular_expression/src/parser/reader/characters.rs diff --git a/crates/oxc_regular_expression/src/parser/reader/mod.rs b/crates/oxc_regular_expression/src/parser/reader/mod.rs index a29c9a44c6f96..c3cd7675a199d 100644 --- a/crates/oxc_regular_expression/src/parser/reader/mod.rs +++ b/crates/oxc_regular_expression/src/parser/reader/mod.rs @@ -1,6 +1,12 @@ +mod ast; +mod characters; +mod options; mod reader_impl; mod string_literal_parser; +mod template_literal_parser; +pub use ast::*; +pub use options::Options; pub use reader_impl::Reader; #[cfg(test)] diff --git a/crates/oxc_regular_expression/src/parser/reader/string_literal_parser/options.rs b/crates/oxc_regular_expression/src/parser/reader/options.rs similarity index 100% rename from crates/oxc_regular_expression/src/parser/reader/string_literal_parser/options.rs rename to crates/oxc_regular_expression/src/parser/reader/options.rs diff --git a/crates/oxc_regular_expression/src/parser/reader/reader_impl.rs b/crates/oxc_regular_expression/src/parser/reader/reader_impl.rs index 1daef08d3b2eb..595cc5ac55600 100644 --- a/crates/oxc_regular_expression/src/parser/reader/reader_impl.rs +++ b/crates/oxc_regular_expression/src/parser/reader/reader_impl.rs @@ -1,14 +1,19 @@ use oxc_diagnostics::Result; use oxc_span::Atom; -use crate::parser::reader::string_literal_parser::{ - Options as StringLiteralParserOptions, Parser as StringLiteralParser, ast as StringLiteralAst, - parse_regexp_literal, +use crate::parser::reader::{ + Options, + ast::CodePoint, + string_literal_parser::{ + Parser as StringLiteralParser, ast as StringLiteralAst, parse_regexp_literal, + }, + template_literal_parser::{Parser as 
TemplateLiteralParser, ast as TemplateLiteralAst}, }; +#[derive(Debug)] pub struct Reader<'a> { source_text: &'a str, - units: Vec, + units: Vec, index: usize, offset: u32, } @@ -17,24 +22,37 @@ impl<'a> Reader<'a> { pub fn initialize( source_text: &'a str, unicode_mode: bool, - parse_string_literal: bool, + parse_string_or_template_literal: bool, ) -> Result { // NOTE: This must be `0`. // Since `source_text` here may be a slice of the original source text, // using `Span` for `span.source_text(source_text)` will be out of range in some cases. let span_offset = 0; - let units = if parse_string_literal { - let StringLiteralAst::StringLiteral { body, .. } = StringLiteralParser::new( - source_text, - StringLiteralParserOptions { - strict_mode: false, - span_offset, - combine_surrogate_pair: unicode_mode, - }, - ) - .parse()?; - body + let units = if parse_string_or_template_literal { + if source_text.chars().next().is_some_and(|c| c == '`') { + let TemplateLiteralAst::TemplateLiteral { body, .. } = TemplateLiteralParser::new( + source_text, + Options { + strict_mode: false, + span_offset, + combine_surrogate_pair: unicode_mode, + }, + ) + .parse()?; + body + } else { + let StringLiteralAst::StringLiteral { body, .. } = StringLiteralParser::new( + source_text, + Options { + strict_mode: false, + span_offset, + combine_surrogate_pair: unicode_mode, + }, + ) + .parse()?; + body + } } else { parse_regexp_literal(source_text, span_offset, unicode_mode) }; @@ -43,9 +61,9 @@ impl<'a> Reader<'a> { source_text, units, index: 0, - // If `parse_string_literal` is `true`, the first character is the opening quote. + // If `parse_string_or_template_literal` is `true`, the first character is the opening quote. // We need to +1 to skip it. 
- offset: u32::from(parse_string_literal), + offset: u32::from(parse_string_or_template_literal), }) } diff --git a/crates/oxc_regular_expression/src/parser/reader/string_literal_parser/ast.rs b/crates/oxc_regular_expression/src/parser/reader/string_literal_parser/ast.rs index a763a497ccf75..7b116309f37b2 100644 --- a/crates/oxc_regular_expression/src/parser/reader/string_literal_parser/ast.rs +++ b/crates/oxc_regular_expression/src/parser/reader/string_literal_parser/ast.rs @@ -1,5 +1,7 @@ use oxc_span::Span; +use crate::parser::reader::ast::CodePoint; + #[derive(Debug)] pub struct StringLiteral { #[allow(unused, clippy::allow_attributes)] @@ -14,12 +16,3 @@ pub enum StringLiteralKind { Double, Single, } - -/// Represents UTF-16 code unit(u16 as u32) or Unicode code point(char as u32). -/// `Span` width may be more than 1, since there will be escape sequences. -#[derive(Debug, Clone, Copy)] -pub struct CodePoint { - pub span: Span, - // NOTE: If we need codegen, more information should be added. 
- pub value: u32, -} diff --git a/crates/oxc_regular_expression/src/parser/reader/string_literal_parser/mod.rs b/crates/oxc_regular_expression/src/parser/reader/string_literal_parser/mod.rs index bd771c4e811af..588cc032efe63 100644 --- a/crates/oxc_regular_expression/src/parser/reader/string_literal_parser/mod.rs +++ b/crates/oxc_regular_expression/src/parser/reader/string_literal_parser/mod.rs @@ -1,15 +1,14 @@ pub mod ast; -mod characters; mod diagnostics; -mod options; mod parser_impl; -pub use options::Options; pub use parser_impl::{Parser, parse_regexp_literal}; #[cfg(test)] mod test { - use super::{Options, Parser, ast, parse_regexp_literal}; + use crate::parser::reader::Options; + + use super::{Parser, ast, parse_regexp_literal}; #[test] fn should_pass() { diff --git a/crates/oxc_regular_expression/src/parser/reader/string_literal_parser/parser_impl.rs b/crates/oxc_regular_expression/src/parser/reader/string_literal_parser/parser_impl.rs index b1ec5b7782ad2..57ed2c124ac0f 100644 --- a/crates/oxc_regular_expression/src/parser/reader/string_literal_parser/parser_impl.rs +++ b/crates/oxc_regular_expression/src/parser/reader/string_literal_parser/parser_impl.rs @@ -1,13 +1,12 @@ use oxc_diagnostics::Result; use oxc_span::Span; -use crate::parser::reader::string_literal_parser::{ - ast, +use crate::parser::reader::{ + CodePoint, Options, characters::{ CR, LF, LS, PS, is_line_terminator, is_non_escape_character, is_single_escape_character, }, - diagnostics, - options::Options, + string_literal_parser::{ast, diagnostics}, }; // Internal representation of escape sequence resolved unit in a string literal. @@ -19,7 +18,7 @@ pub fn parse_regexp_literal( source_text: &str, span_offset: u32, combine_surrogate_pair: bool, -) -> Vec { +) -> Vec { let mut body = vec![]; let mut offset = 0; @@ -50,7 +49,7 @@ pub struct Parser { impl Parser { // This is public because it is used in `parse_regexp_literal()`. 
pub fn handle_code_point( - body: &mut Vec, + body: &mut Vec, (offsets, cp): OffsetsAndCp, span_offset: u32, combine_surrogate_pair: bool, @@ -59,13 +58,13 @@ impl Parser { if combine_surrogate_pair || (0..=0xffff).contains(&cp) { // If the code point is in the BMP or if forced, just push it - body.push(ast::CodePoint { span, value: cp }); + body.push(CodePoint { span, value: cp }); } else { // Otherwise, split the code point into a surrogate pair, sharing the same span let (lead, trail) = (0xd800 + ((cp - 0x10000) >> 10), 0xdc00 + ((cp - 0x10000) & 0x3ff)); - body.push(ast::CodePoint { span, value: lead }); - body.push(ast::CodePoint { span, value: trail }); + body.push(CodePoint { span, value: lead }); + body.push(CodePoint { span, value: trail }); } } @@ -114,10 +113,7 @@ impl Parser { // SingleStringCharacters :: // SingleStringCharacter SingleStringCharacters[opt] // ``` - fn parse_string_characters( - &mut self, - single_or_double_quote: char, - ) -> Result> { + fn parse_string_characters(&mut self, single_or_double_quote: char) -> Result> { let mut body = vec![]; while let Some(code_point) = self.parse_string_character(single_or_double_quote)? { Parser::handle_code_point( diff --git a/crates/oxc_regular_expression/src/parser/reader/template_literal_parser/README.md b/crates/oxc_regular_expression/src/parser/reader/template_literal_parser/README.md new file mode 100644 index 0000000000000..eb3f2173e09cf --- /dev/null +++ b/crates/oxc_regular_expression/src/parser/reader/template_literal_parser/README.md @@ -0,0 +1,7 @@ +# template_literal_parser + +Implements ECMAScript® 2025 Language Specification + +- https://tc39.es/ecma262/2025/multipage/ecmascript-language-lexical-grammar.html#sec-template-literal-lexical-components + +It only supports `NoSubstitutionTemplate` and returns a diagnostic when it finds `${}` inside it. 
diff --git a/crates/oxc_regular_expression/src/parser/reader/template_literal_parser/ast.rs b/crates/oxc_regular_expression/src/parser/reader/template_literal_parser/ast.rs new file mode 100644 index 0000000000000..71fde54ed8765 --- /dev/null +++ b/crates/oxc_regular_expression/src/parser/reader/template_literal_parser/ast.rs @@ -0,0 +1,10 @@ +use oxc_span::Span; + +use crate::parser::reader::CodePoint; + +#[derive(Debug)] +pub struct TemplateLiteral { + #[allow(unused, clippy::allow_attributes)] + pub span: Span, + pub body: Vec, +} diff --git a/crates/oxc_regular_expression/src/parser/reader/template_literal_parser/diagnostics.rs b/crates/oxc_regular_expression/src/parser/reader/template_literal_parser/diagnostics.rs new file mode 100644 index 0000000000000..c5f32a4d5caf0 --- /dev/null +++ b/crates/oxc_regular_expression/src/parser/reader/template_literal_parser/diagnostics.rs @@ -0,0 +1,31 @@ +use oxc_diagnostics::OxcDiagnostic; +use oxc_span::Span; + +#[cold] +pub fn invalid_input(span: Span) -> OxcDiagnostic { + OxcDiagnostic::error( + "Template literal should be wrapped with ` or escaped properly".to_string(), + ) + .with_label(span) +} + +#[cold] +pub fn template_substitution(span: Span) -> OxcDiagnostic { + OxcDiagnostic::error("Template literal should not contain unescaped `${}`".to_string()) + .with_label(span) +} + +#[cold] +pub fn too_large_unicode_escape_sequence(span: Span) -> OxcDiagnostic { + OxcDiagnostic::error("Too large unicode escape sequence".to_string()).with_label(span) +} + +#[cold] +pub fn invalid_hex_escape(span: Span) -> OxcDiagnostic { + OxcDiagnostic::error("Invalid hex escape sequence".to_string()).with_label(span) +} + +#[cold] +pub fn invalid_unicode_escape(span: Span) -> OxcDiagnostic { + OxcDiagnostic::error("Invalid unicode escape sequence".to_string()).with_label(span) +} diff --git a/crates/oxc_regular_expression/src/parser/reader/template_literal_parser/mod.rs 
b/crates/oxc_regular_expression/src/parser/reader/template_literal_parser/mod.rs new file mode 100644 index 0000000000000..3ec2ea690db19 --- /dev/null +++ b/crates/oxc_regular_expression/src/parser/reader/template_literal_parser/mod.rs @@ -0,0 +1,71 @@ +pub mod ast; +mod diagnostics; +mod parser_impl; + +pub use parser_impl::Parser; + +#[cfg(test)] +mod test { + use crate::parser::reader::{Options, template_literal_parser::parser_impl::Parser}; + + #[test] + fn should_pass() { + for source_text in [ + "``", + "`Hello, world!`", + r#"`He said, "Hello!"`"#, + r#"`She said, "Hello!"`"#, + r"`It's a sunny day`", + r"`Line1\nLine2`", + r"`Column1\tColumn2`", + r"`Path to file: C:\\Program Files\\MyApp`", + r"`Backspace\bTest`", + r"`FormFeed\fTest`", + r"`CarriageReturn\rTest`", + r"`TestWithValidDollarSignAtTheEnd$`", + r"`TestWithValid$DollarSignBetween`", + r"`VerticalTab\vTest`", + "` `", // whitespace only + r"`Escaped \``", // escaped backtick + r"`Unicode: \u0041`", // unicode escape + r"`Code point: \u{1F600}`", // unicode code point escape + r"`Hex: \x41`", // hex escape + "`Line1\\\nLine2`", // line continuation + "`Price: $100`", // dollar sign not followed by brace + r"`Smile: \uD83D\uDE00`", // surrogate pair + r"`Mix: \n\t\x41\u0042\u{43}`", // multiple escapes + r"`Dollar: \$`", // escaped dollar + r"`Null: \0`", // should be valid if not followed by a digit + r"`Surrogate: \uD800`", // lone high surrogate, not a valid code point but no error is reported + r"`Valid: \z`", + "`This is + a multi-line + template literal`", + ] { + if let Err(err) = Parser::new(source_text, Options::default()).parse() { + panic!("Expect to parse: {source_text} but failed: {err}"); + } + } + } + + #[test] + fn should_fail() { + for source_text in [ + "`Unclosed template literal", + r"`Invalid hex escape: \xG1`", + r"`Invalid unicode escape: \u{G1}`", + r"`Template with ${expression}`", // expression not supported + r"`Incomplete: \u`", + r"`Incomplete: \u{}`", + r"`Incomplete: 
\u{110000}`", // out of Unicode range + r"`Incomplete: \x`", + r"`Incomplete: \x4`", + "`Unescaped backtick: ``", + r"`Line continuation: \\n\xG1`", + r"`Dollar: ${{}`", // should fail, as `${` is not supported + ] { + let result = Parser::new(source_text, Options::default()).parse(); + assert!(result.is_err(), "Expect to fail: {source_text} but passed..."); + } + } +} diff --git a/crates/oxc_regular_expression/src/parser/reader/template_literal_parser/parser_impl.rs b/crates/oxc_regular_expression/src/parser/reader/template_literal_parser/parser_impl.rs new file mode 100644 index 0000000000000..6ebb80ac56945 --- /dev/null +++ b/crates/oxc_regular_expression/src/parser/reader/template_literal_parser/parser_impl.rs @@ -0,0 +1,555 @@ +use oxc_diagnostics::Result; +use oxc_span::Span; + +use crate::parser::reader::{ + Options, + ast::CodePoint, + characters::{ + CR, LF, LS, PS, is_line_terminator, is_non_escape_character, is_single_escape_character, + }, + template_literal_parser::{ast, diagnostics}, +}; + +// Internal representation of an escape-sequence-resolved unit in a template literal. +type OffsetsAndCp = ((u32, u32), u32); + +pub struct Parser { + // NOTE: In JavaScript, template literals are UTF-16 encoded, + // so we need to be aware of surrogate pairs, while collecting offsets for `Span`. + // Rather than using `encode_utf16()`, splitting surrogate pairs manually makes it easier + // to detect the start and end of each code point. 
+ chars: Vec, + index: usize, + offset: u32, + options: Options, +} + +impl Parser { + fn handle_code_point( + body: &mut Vec, + (offsets, cp): OffsetsAndCp, + span_offset: u32, + combine_surrogate_pair: bool, + ) { + let span = Span::new(span_offset + offsets.0, span_offset + offsets.1); + + if combine_surrogate_pair || (0..=0xffff).contains(&cp) { + // If the code point is in the BMP or if forced, just push it + body.push(CodePoint { span, value: cp }); + } else { + // Otherwise, split the code point into a surrogate pair, sharing the same span + let (lead, trail) = + (0xd800 + ((cp - 0x10000) >> 10), 0xdc00 + ((cp - 0x10000) & 0x3ff)); + body.push(CodePoint { span, value: lead }); + body.push(CodePoint { span, value: trail }); + } + } + + // --- + + pub fn new(source_text: &str, options: Options) -> Self { + Self { chars: source_text.chars().collect::>(), index: 0, offset: 0, options } + } + + // We do not parse TemplateHead, TemplateTail, TemplateSubstitutionTail, TemplateMiddle + // ``` + // Template :: + // NoSubstitutionTemplate + // TemplateHead + // + // NoSubstitutionTemplate :: + // ` TemplateCharacters[opt] ` + // + // ``` + pub fn parse(mut self) -> Result { + if !self.eat('`') { + return Err(diagnostics::invalid_input(Span::empty(self.options.span_offset))); + } + + let body = self.parse_template()?; + + if self.eat('`') { + if self.peek().is_some() { + return Err(diagnostics::invalid_input(Span::empty( + self.options.span_offset + self.offset(), + ))); + } + + let span = Span::sized(self.options.span_offset, self.offset()); + return Ok(ast::TemplateLiteral { span, body }); + } + + Err(diagnostics::invalid_input(Span::empty(self.options.span_offset + self.offset()))) + } + + // --- + + // ``` + // Template :: + // NoSubstitutionTemplate + // TemplateHead + // ``` + fn parse_template(&mut self) -> Result> { + // ToDo: diagnostic when TemplateHead is found + self.parse_template_characters() + } + + // ``` + // TemplateCharacters :: + // 
TemplateCharacter TemplateCharacters[opt] + // ``` + fn parse_template_characters(&mut self) -> Result> { + let mut body = vec![]; + while let Some(code_point) = self.parse_template_character()? { + Parser::handle_code_point( + &mut body, + code_point, + self.options.span_offset, + self.options.combine_surrogate_pair, + ); + } + Ok(body) + } + + // ``` + // TemplateCharacter :: + // $ [lookahead ≠ {] + // \ TemplateEscapeSequence + // \ NotEscapeSequence + // LineContinuation + // LineTerminatorSequence + // SourceCharacter but not one of ` or \ or $ or LineTerminator + // ``` + fn parse_template_character(&mut self) -> Result> { + let offset_start = self.offset(); + + // $ [lookahead ≠ {] + if self.peek() == Some('$') && self.peek2() != Some('{') { + self.advance(); + return Ok(Some(((offset_start, self.offset()), '$' as u32))); + } + + if self.eat('\\') { + if let Some(cp) = self.parse_template_escape_sequence(offset_start)? { + return Ok(Some(((offset_start, self.offset()), cp))); + } + if let Some(cp) = self.parse_not_escape_sequence()? { + return Ok(Some(((offset_start, self.offset()), cp))); + } + } + if let Some(cp) = self.parse_line_continuation() { + return Ok(Some(((offset_start, self.offset()), cp))); + } + if let Some(cp) = self.parse_line_terminator_sequence() { + return Ok(Some(((offset_start, self.offset()), cp))); + } + + if let Some(ch) = self.peek() { + if ch == '$' { + // Skip it too, but we do not support `TemplateHead ` or `TemplateMiddle`. 
+ return Err(diagnostics::template_substitution(Span::new( + self.options.span_offset + offset_start, + self.options.span_offset + self.offset(), + ))); + } + + if ch == '\\' || ch == '`' || is_line_terminator(ch) { + return Ok(None); + } + + self.advance(); + + return Ok(Some(((offset_start, self.offset()), ch as u32))); + } + + Ok(None) + } + + // ``` + // TemplateEscapeSequence :: + // CharacterEscapeSequence + // 0 [lookahead ∉ DecimalDigit] + // HexEscapeSequence + // UnicodeEscapeSequence + // ``` + fn parse_template_escape_sequence(&mut self, offset_start: u32) -> Result> { + if let Some(cp) = self.parse_character_escape_sequence() { + return Ok(Some(cp)); + } + if self.peek() == Some('0') && self.peek2().is_none_or(|ch| !ch.is_ascii_digit()) { + self.advance(); + return Ok(Some(0x00)); + } + if let Some(cp) = self.parse_hex_escape_sequence()? { + return Ok(Some(cp)); + } + if let Some(cp) = self.parse_unicode_escape_sequence(offset_start)? { + return Ok(Some(cp)); + } + + Ok(None) + } + + // ``` + // CharacterEscapeSequence :: + // SingleEscapeCharacter + // NonEscapeCharacter + // ``` + fn parse_character_escape_sequence(&mut self) -> Option { + if let Some(ch) = self.peek().filter(|&ch| is_single_escape_character(ch)) { + self.advance(); + return Some(ch as u32); + } + if let Some(ch) = self.peek().filter(|&ch| is_non_escape_character(ch)) { + self.advance(); + return Some(ch as u32); + } + + None + } + + // ``` + // NotEscapeSequence :: + // 0 DecimalDigit + // DecimalDigit but not 0 + // x [lookahead ∉ HexDigit] + // x HexDigit [lookahead ∉ HexDigit] + // u [lookahead ∉ HexDigit] [lookahead ≠ {] + // u HexDigit [lookahead ∉ HexDigit] + // u HexDigit HexDigit [lookahead ∉ HexDigit] + // u HexDigit HexDigit HexDigit [lookahead ∉ HexDigit] + // u { [lookahead ∉ HexDigit] + // u { NotCodePoint [lookahead ∉ HexDigit] + // u { CodePoint [lookahead ∉ HexDigit] [lookahead ≠ }] + // ``` + fn parse_not_escape_sequence(&mut self) -> Result> { + let checkpoint = 
self.checkpoint(); + + // 0 DecimalDigit + if self.eat('0') { + if let Some(ch) = self.peek() { + if ch.is_ascii_digit() { + self.advance(); + return Ok(Some(ch as u32)); + } + } + self.rewind(checkpoint); + } + + // DecimalDigit but not 0 + if let Some(ch) = self.peek() { + if ch.is_ascii_digit() && ch != '0' { + self.advance(); + return Ok(Some(ch as u32)); + } + } + + // x [lookahead ∉ HexDigit] or x HexDigit [lookahead ∉ HexDigit] + if self.eat('x') { + let offset_start = self.offset(); + match self.consume_hex_digits(offset_start) { + Ok(Some(_)) => { + self.rewind(checkpoint); + return Ok(None); + } + Ok(None) => { + return Ok(Some('x' as u32)); + } + Err(e) => { + return Err(e); + } + } + } + + // u [lookahead ∉ HexDigit] [lookahead ≠ {] + // u HexDigit [lookahead ∉ HexDigit] + // u HexDigit HexDigit [lookahead ∉ HexDigit] + // u HexDigit HexDigit HexDigit [lookahead ∉ HexDigit] + // u { [lookahead ∉ HexDigit] + // u { NotCodePoint [lookahead ∉ HexDigit] + // u { CodePoint [lookahead ∉ HexDigit] [lookahead ≠ }] + if self.eat('u') { + let offset_start = self.offset(); + if self.eat('{') { + match self.consume_hex_digits(offset_start) { + Ok(Some(_)) => { + if !self.eat('}') { + return Ok(Some('u' as u32)); + } + self.rewind(checkpoint); + return Ok(None); + } + Ok(None) => { + return Ok(Some('u' as u32)); + } + Err(e) => { + return Err(e); + } + } + } + let mut hex_count = 0; + for _ in 0..4 { + if let Some(ch) = self.peek() { + if ch.is_ascii_hexdigit() { + self.advance(); + hex_count += 1; + } else { + break; + } + } else { + break; + } + } + if hex_count == 0 || hex_count < 4 { + return Ok(Some('u' as u32)); + } + if let Some(ch) = self.peek() { + if ch.is_ascii_hexdigit() { + self.rewind(checkpoint); + return Ok(None); + } + } + self.rewind(checkpoint); + return Ok(None); + } + + Ok(None) + } + + // ``` + // HexEscapeSequence :: + // x HexDigit HexDigit + // ``` + fn parse_hex_escape_sequence(&mut self) -> Result> { + if self.eat('x') { + let first = 
self.consume_hex_digit(); + let second = self.consume_hex_digit(); + if let (Some(first), Some(second)) = (first, second) { + return Ok(Some(first * 16 + second)); + } + + // Invalid hex escape: \x not followed by two hex digits + return Err(diagnostics::invalid_hex_escape(Span::new( + self.options.span_offset + self.offset(), + self.options.span_offset + self.offset(), + ))); + } + + Ok(None) + } + + // ``` + // UnicodeEscapeSequence :: + // u Hex4Digits + // u{ CodePoint } + // ``` + fn parse_unicode_escape_sequence(&mut self, offset_start: u32) -> Result> { + let checkpoint = self.checkpoint(); + + if self.eat('u') { + if let Some(cp) = self.consume_hex4_digits() { + return Ok(Some(cp)); + } else if self.peek() != Some('{') { + // If not followed by 4 hex digits or a code point escape, error + return Err(diagnostics::invalid_unicode_escape(Span::new( + self.options.span_offset + offset_start, + self.options.span_offset + self.offset(), + ))); + } + self.rewind(checkpoint); + } + + if self.eat('u') { + if self.eat('{') { + // Try to parse hex digits, error if not valid + match self.consume_hex_digits(offset_start) { + Ok(Some(hex_digits)) if hex_digits <= 0x10_ffff => { + if self.eat('}') { + return Ok(Some(hex_digits)); + } + + // Missing closing '}' + return Err(diagnostics::invalid_unicode_escape(Span::new( + self.options.span_offset + offset_start, + self.options.span_offset + self.offset(), + ))); + } + Ok(_) => { + // No valid hex digits or out of range + return Err(diagnostics::invalid_unicode_escape(Span::new( + self.options.span_offset + offset_start, + self.options.span_offset + self.offset(), + ))); + } + Err(e) => return Err(e), + } + } + self.rewind(checkpoint); + } + + Ok(None) + } + + // ``` + // LineContinuation :: + // \ LineTerminatorSequence + // + // ``` + fn parse_line_continuation(&mut self) -> Option { + let checkpoint = self.checkpoint(); + + if self.eat('\\') { + if let Some(terminator) = self.parse_line_terminator_sequence() { + return 
Some(terminator); + } + } + + self.rewind(checkpoint); + None + } + + // ``` + // LineTerminatorSequence :: + // + // [lookahead ≠ ] + // + // + // + // ``` + fn parse_line_terminator_sequence(&mut self) -> Option { + let checkpoint = self.checkpoint(); + + if self.peek() == Some(LF) { + self.advance(); + return Some(LF as u32); + } + if self.peek() == Some(CR) && self.peek2() != Some(LF) { + self.advance(); + return Some(CR as u32); + } + if self.peek() == Some(LS) { + self.advance(); + return Some(LS as u32); + } + if self.peek() == Some(PS) { + self.advance(); + return Some(PS as u32); + } + // NOTE: CR+LF can not represent as a single code point. + // I don't know the best way to handle this. + // To distinguish this from CR and LF, structural change is needed... + if self.peek() == Some(CR) && self.peek2() == Some(LF) { + self.advance(); + self.advance(); + return Some(LF as u32); + } + + self.rewind(checkpoint); + None + } + + // --- + + fn consume_hex_digit(&mut self) -> Option { + if let Some(ch) = self.peek().filter(char::is_ascii_hexdigit) { + self.advance(); + return ch.to_digit(16); + } + + None + } + + // ``` + // Hex4Digits :: + // HexDigit HexDigit HexDigit HexDigit + // ``` + fn consume_hex4_digits(&mut self) -> Option { + let checkpoint = self.checkpoint(); + + let mut value = 0; + for _ in 0..4 { + let Some(hex) = + self.peek().filter(char::is_ascii_hexdigit).and_then(|ch| ch.to_digit(16)) + else { + self.rewind(checkpoint); + return None; + }; + + value = (16 * value) + hex; + self.advance(); + } + + Some(value) + } + + fn consume_hex_digits(&mut self, offset_start: u32) -> Result> { + let checkpoint = self.checkpoint(); + + let mut value: u32 = 0; + while let Some(hex) = + self.peek().filter(char::is_ascii_hexdigit).and_then(|ch| ch.to_digit(16)) + { + // To prevent panic on overflow cases like `\u{FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF}` + if let Some(v) = value.checked_mul(16).and_then(|v| v.checked_add(hex)) { + value = v; + self.advance(); + } else 
{ + return Err(diagnostics::too_large_unicode_escape_sequence(Span::new( + self.options.span_offset + offset_start, + self.options.span_offset + self.offset(), + ))); + } + } + + if self.checkpoint() != checkpoint { + return Ok(Some(value)); + } + + Ok(None) + } + + // --- + + fn checkpoint(&self) -> (usize, u32) { + (self.index, self.offset) + } + + fn rewind(&mut self, checkpoint: (usize, u32)) { + self.index = checkpoint.0; + self.offset = checkpoint.1; + } + + fn advance(&mut self) { + if let Some(ch) = self.chars.get(self.index) { + #[expect(clippy::cast_possible_truncation)] + let len = ch.len_utf8() as u32; + self.offset += len; + self.index += 1; + } + } + + fn eat(&mut self, ch: char) -> bool { + if self.peek() == Some(ch) { + self.advance(); + return true; + } + false + } + + fn offset(&self) -> u32 { + self.offset + } + + fn peek_nth(&self, n: usize) -> Option { + let nth = self.index + n; + self.chars.get(nth).copied() + } + + fn peek(&self) -> Option { + self.peek_nth(0) + } + + fn peek2(&self) -> Option { + self.peek_nth(1) + } +}