From 242212d462535eb76dc82ccda6565253f14d8f7d Mon Sep 17 00:00:00 2001 From: IWANABETHATGUY Date: Sun, 10 Dec 2023 23:31:38 +0800 Subject: [PATCH 01/19] =?UTF-8?q?chore:=20=F0=9F=A4=96=20ckpoint?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Cargo.lock | 1 + crates/oxc_js_regex/Cargo.toml | 1 + crates/oxc_js_regex/src/ast.rs | 54 +++---- crates/oxc_js_regex/src/ast_builder.rs | 73 ++++++++++ crates/oxc_js_regex/src/ast_kind.rs | 25 ++++ crates/oxc_js_regex/src/ecma_version.rs | 22 +++ crates/oxc_js_regex/src/lib.rs | 3 + crates/oxc_js_regex/src/parser.rs | 178 ++++++++++++++++++++++++ 8 files changed, 331 insertions(+), 26 deletions(-) create mode 100644 crates/oxc_js_regex/src/ast_builder.rs create mode 100644 crates/oxc_js_regex/src/ast_kind.rs create mode 100644 crates/oxc_js_regex/src/ecma_version.rs diff --git a/Cargo.lock b/Cargo.lock index 3b11ab893b40c..b054d73d70750 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1600,6 +1600,7 @@ name = "oxc_js_regex" version = "0.0.0" dependencies = [ "oxc_allocator", + "oxc_diagnostics", "oxc_span", ] diff --git a/crates/oxc_js_regex/Cargo.toml b/crates/oxc_js_regex/Cargo.toml index 38dcac20c5f06..3f65315f2e836 100644 --- a/crates/oxc_js_regex/Cargo.toml +++ b/crates/oxc_js_regex/Cargo.toml @@ -21,3 +21,4 @@ doctest = false [dependencies] oxc_allocator = { workspace = true } oxc_span = { workspace = true } +oxc_diagnostics = { workspace = true } diff --git a/crates/oxc_js_regex/src/ast.rs b/crates/oxc_js_regex/src/ast.rs index 6690c5f7386fc..47ea3e203f3eb 100644 --- a/crates/oxc_js_regex/src/ast.rs +++ b/crates/oxc_js_regex/src/ast.rs @@ -3,6 +3,8 @@ use oxc_allocator::{Box, Vec}; use oxc_span::{Atom, Span}; +use crate::ast_kind::AstKind; + /// The type which includes all nodes. #[derive(Debug)] pub enum Node<'a> { @@ -42,46 +44,46 @@ pub enum Leaf<'a> { /// The type which includes all atom nodes. #[derive(Debug)] pub enum Element<'a> { - Assertion(Box<'a, Assertion<'a>>), - QuantifiableElement(Box<'a, QuantifiableElement<'a>>), - Quantifier(Box<'a, Quantifier<'a>>), + Assertion(Assertion<'a>), + QuantifiableElement(QuantifiableElement<'a>), + Quantifier(Quantifier<'a>), } /// The type which includes all atom nodes that Quantifier node can have as children. #[derive(Debug)] pub enum QuantifiableElement<'a> { - Backreference(Box<'a, Backreference<'a>>), - CapturingGroup(Box<'a, CapturingGroup<'a>>), - Character(Box<'a, Character>), - CharacterClass(Box<'a, CharacterClass<'a>>), - CharacterSet(Box<'a, CharacterSet<'a>>), - ExpressionCharacterClass(Box<'a, ExpressionCharacterClass<'a>>), - Group(Box<'a, Group<'a>>), - LookaheadAssertion(Box<'a, LookaheadAssertion<'a>>), + Backreference(Backreference<'a>), + CapturingGroup(CapturingGroup<'a>), + Character(Character), + CharacterClass(CharacterClass<'a>), + CharacterSet(CharacterSet<'a>), + ExpressionCharacterClass(ExpressionCharacterClass<'a>), + Group(Group<'a>), + LookaheadAssertion(LookaheadAssertion<'a>), } /// The type which includes all character class atom nodes. #[derive(Debug)] pub enum CharacterClassElement<'a> { - ClassRangesCharacterClassElement(Box<'a, ClassRangesCharacterClassElement<'a>>), - UnicodeSetsCharacterClassElement(Box<'a, UnicodeSetsCharacterClassElement<'a>>), + ClassRangesCharacterClassElement(ClassRangesCharacterClassElement), + UnicodeSetsCharacterClassElement(UnicodeSetsCharacterClassElement<'a>), } #[derive(Debug)] -pub enum ClassRangesCharacterClassElement<'a> { - Character(Box<'a, Character>), - CharacterClassRange(Box<'a, CharacterClassRange>), - CharacterUnicodePropertyCharacterSet(Box<'a, CharacterUnicodePropertyCharacterSet>), - EscapeCharacterSet(Box<'a, EscapeCharacterSet>), +pub enum ClassRangesCharacterClassElement { + Character(Character), + CharacterClassRange(CharacterClassRange), + CharacterUnicodePropertyCharacterSet(CharacterUnicodePropertyCharacterSet), + EscapeCharacterSet(EscapeCharacterSet), } #[derive(Debug)] pub enum UnicodeSetsCharacterClassElement<'a> { - Character(Box<'a, Character>), - CharacterClassRange(Box<'a, CharacterClassRange>), - ClassStringDisjunction(Box<'a, ClassStringDisjunction<'a>>), - EscapeCharacterSet(Box<'a, EscapeCharacterSet>), - ExpressionCharacterClass(Box<'a, ExpressionCharacterClass<'a>>), - UnicodePropertyCharacterSet(Box<'a, UnicodePropertyCharacterSet<'a>>), - UnicodeSetsCharacterClass(Box<'a, UnicodeSetsCharacterClass<'a>>), + Character(Character), + CharacterClassRange(CharacterClassRange), + ClassStringDisjunction(ClassStringDisjunction<'a>), + EscapeCharacterSet(EscapeCharacterSet), + ExpressionCharacterClass(ExpressionCharacterClass<'a>), + UnicodePropertyCharacterSet(UnicodePropertyCharacterSet<'a>), + UnicodeSetsCharacterClass(UnicodeSetsCharacterClass<'a>), } /// The root node. @@ -176,7 +178,7 @@ pub enum CharacterClass<'a> { pub struct ClassRangesCharacterClass<'a> { pub span: Span, pub unicode_sets: bool, - pub elements: Vec<'a, ClassRangesCharacterClassElement<'a>>, + pub elements: Vec<'a, ClassRangesCharacterClassElement>, } /// The character class used in Unicode sets mode (`v` flag). diff --git a/crates/oxc_js_regex/src/ast_builder.rs b/crates/oxc_js_regex/src/ast_builder.rs new file mode 100644 index 0000000000000..c94434cdf8a06 --- /dev/null +++ b/crates/oxc_js_regex/src/ast_builder.rs @@ -0,0 +1,73 @@ +use oxc_allocator::{Allocator, Box, String, Vec}; +use oxc_span::{Atom, GetSpan, SourceType, Span}; + +#[allow(clippy::wildcard_imports)] +use crate::ast::*; + +/// AST builder for creating AST nodes +pub struct AstBuilder<'a> { + pub allocator: &'a Allocator, +} + +impl<'a> AstBuilder<'a> { + pub fn new(allocator: &'a Allocator) -> Self { + Self { allocator } + } + + #[inline] + pub fn alloc(&self, value: T) -> Box<'a, T> { + Box(self.allocator.alloc(value)) + } + + #[inline] + pub fn new_vec(&self) -> Vec<'a, T> { + Vec::new_in(self.allocator) + } + + #[inline] + pub fn new_vec_with_capacity(&self, capacity: usize) -> Vec<'a, T> { + Vec::with_capacity_in(capacity, self.allocator) + } + + #[inline] + pub fn new_vec_single(&self, value: T) -> Vec<'a, T> { + let mut vec = self.new_vec_with_capacity(1); + vec.push(value); + vec + } + + #[inline] + pub fn new_str(&self, value: &str) -> &'a str { + String::from_str_in(value, self.allocator).into_bump_str() + } + + pub fn copy(&self, src: &T) -> T { + // SAFETY: + // This should be safe as long as `src` is an reference from the allocator. + // But honestly, I'm not really sure if this is safe. + unsafe { std::mem::transmute_copy(src) } + } + + pub fn alternative(&mut self, span: Span, elements: Vec<'a, Element<'a>>) -> Branch<'a> { + Branch::Alternative(self.alloc(Alternative { span, elements })) + } + + pub fn capturing_group( + &mut self, + span: Span, + name: Option, + alternatives: Vec<'a, Alternative<'a>>, + references: Vec<'a, Backreference<'a>>, + ) -> Branch<'a> { + Branch::CapturingGroup(self.alloc(CapturingGroup { span, name, alternatives, references })) + } + + pub fn reg_exp_literal( + &mut self, + span: Span, + flags: Flags, + pattern: Pattern<'a>, + ) -> RegExpLiteral<'a> { + RegExpLiteral { span, pattern, flags } + } +} diff --git a/crates/oxc_js_regex/src/ast_kind.rs b/crates/oxc_js_regex/src/ast_kind.rs new file mode 100644 index 0000000000000..97eb10619a408 --- /dev/null +++ b/crates/oxc_js_regex/src/ast_kind.rs @@ -0,0 +1,25 @@ +use super::ast::*; + +#[allow(unused)] +#[derive(Debug)] +pub enum AstKind<'a> { + Alternative(&'a Alternative<'a>), + CapturingGroup(&'a CapturingGroup<'a>), + CharacterClass(&'a CharacterClass<'a>), + CharacterClassRange(&'a CharacterClassRange), + ClassIntersection(&'a ClassIntersection<'a>), + ClassStringDisjunction(&'a ClassStringDisjunction<'a>), + ClassSubtraction(&'a ClassSubtraction<'a>), + ExpressionCharacterClass(&'a ExpressionCharacterClass<'a>), + Group(&'a Group<'a>), + LookaroundAssertion(&'a LookaroundAssertion<'a>), + Pattern(&'a Pattern<'a>), + Quantifier(&'a Quantifier<'a>), + RegExpLiteral(&'a RegExpLiteral<'a>), + StringAlternative(&'a StringAlternative<'a>), + Backreference(&'a Backreference<'a>), + BoundaryAssertion(&'a BoundaryAssertion<'a>), + Character(&'a Character), + CharacterSet(&'a CharacterSet<'a>), + Flags(&'a Flags), +} diff --git a/crates/oxc_js_regex/src/ecma_version.rs b/crates/oxc_js_regex/src/ecma_version.rs new file mode 100644 index 0000000000000..f71e61cd125d7 --- /dev/null +++ b/crates/oxc_js_regex/src/ecma_version.rs @@ -0,0 +1,22 @@ +#[allow(unused)] +#[derive(Clone, Copy, PartialEq, PartialOrd, Default)] +pub enum EcmaVersion { + #[default] + V5, + V2015, + V2016, + V2017, + V2018, + V2019, + V2020, + V2021, + V2022, + V2023, + V2024, +} +#[allow(unused)] +impl EcmaVersion { + pub fn latest_ecma_version() -> Self { + Self::V2024 + } +} diff --git a/crates/oxc_js_regex/src/lib.rs b/crates/oxc_js_regex/src/lib.rs index 6647fb03be8f5..515c301327a72 100644 --- a/crates/oxc_js_regex/src/lib.rs +++ b/crates/oxc_js_regex/src/lib.rs @@ -1,4 +1,7 @@ pub mod ast; +mod ast_builder; +mod ast_kind; +mod ecma_version; mod lexer; pub mod parser; pub mod validator; diff --git a/crates/oxc_js_regex/src/parser.rs b/crates/oxc_js_regex/src/parser.rs index 8b137891791fe..f48998a39de0e 100644 --- a/crates/oxc_js_regex/src/parser.rs +++ b/crates/oxc_js_regex/src/parser.rs @@ -1 +1,179 @@ +use std::collections::{HashSet, VecDeque}; +use std::iter::Peekable; +use std::ops::Range; +use std::str::{CharIndices, Chars, Matches}; +use oxc_allocator::Allocator; +use oxc_diagnostics::Error; + +use crate::ast::{Branch, Pattern, RegExpLiteral}; +use crate::ecma_version::EcmaVersion; + +pub struct Lexer<'a> { + allocator: &'a Allocator, + + source: &'a str, + /// Regex usually, use a collected `Vec` could reduce lookahead and other util function implementation complexity + chars: Vec, + + pub(crate) errors: Vec, +} + +#[allow(clippy::unused_self)] +impl<'a> Lexer<'a> { + pub fn new(allocator: &'a Allocator, source: &'a str) -> Self { + Self { source, allocator, errors: vec![], chars: source.chars().collect::>() } + } +} + +pub struct Parser<'a> { + lexer: Lexer<'a>, + + /// Source Code + source_text: &'a str, + + /// All syntax errors from parser and lexer + /// Note: favor adding to `Diagnostics` instead of raising Err + errors: Vec, + context: ParserContext, + index: usize, + group_names: HashSet, +numCapturingParens: usize +} + +#[derive(Default, Copy, Clone)] +struct ParserContext { + source_kind: SourceKind, + unicode_mode: bool, + nflag: bool, + unicode_sets_mode: bool, + ecma_version: EcmaVersion, +} + +impl<'a> Parser<'a> { + /// Create a new parser + pub fn new(allocator: &'a Allocator, source_text: &'a str) -> Self { + Self { + lexer: Lexer::new(allocator, source_text), + source_text, + errors: vec![], + context: ParserContext::default(), + index: 0, + group_names: HashSet::new(), + } + } + + pub fn eat(&self, ch: char) -> bool { + self.lexer.chars.get(self.index) == Some(&ch) + } + + pub fn nth(&self, n: usize) -> Option<&char> { + self.lexer.chars.get(self.index + n) + } + + /// by default next means `next_1` + pub fn next(&self) -> Option<&char> { + self.lexer.chars.get(self.index + 1) + } + + /// get a range chars relative from current cursor + pub fn nrange(&self, range: Range) -> Option<&[char]> { + self.lexer.chars.get(self.index + range.start..(self.index + range.end)) + } + + pub fn current(&self) -> Option<&char> { + self.lexer.chars.get(self.index) + } + + pub fn advance(&mut self) -> bool { + if self.index < self.lexer.chars.len() { + self.index += 1; + return true; + } else { + false + } + } + + pub fn rewind(&mut self, start: usize) { + self.index = start; + } +} + +#[derive(Default, Clone, Copy)] +pub enum SourceKind { + Flags, + #[default] + Literal, + Pattern, +} + +pub fn parse_literal<'a>(parser: &mut Parser<'a>) -> RegExpLiteral<'a> { + if parser.eat('/') { + parser.advance(); + let pattern = parse_pattern(parser); + todo!() + } else if parser.source_text.is_empty() { + panic!("Empty") + } else { + match parser.current() { + Some(ch) => { + panic!("unexpected character {ch}") + } + None => { + panic!("unexpected eof") + } + }; + } +} + +fn parse_pattern<'a>(parser: &mut Parser<'a>) -> Pattern<'a> { + let start = parser.index; + if let Some(pattern) = parse_pattern_internal(parser) { + return pattern; + } else if !parser.context.nflag + && parser.context.ecma_version >= EcmaVersion::V2018 + && parser.group_names.len() > 0 + { + parser.rewind(start); + parser.context.nflag = true; + return parse_pattern_internal(parser).expect("should have pattern"); + } + panic!("Invalid pattern") +} + +fn parse_pattern_internal<'a>(parser: &mut Parser<'a>) -> Option> { + let start = parser.index; + let + todo!() +} + +fn count_capturing_parens<'a>(parser: &mut Parser<'a>) -> usize { + let start = parser.index; + let mut in_class = false; + let mut escaped = false; + let count = 0; + while let Some(ch) = parser.current() { + if escaped { + escaped = false; + } + match ch { + '\\' => { + escaped = true; + } + '[' | ']' => { + in_class = false; + } + '(' if !in_class => { + if parser.next() != Some(&'?') + || (parser.nth(2) == Some(&'<') && !matches!(parser.nth(3), '=' | '!')) + { + count += 1; + } + } + _ => {} + } + parser.advance(); + } + parser.rewind(start); + count +} From 716f03532daf11b321899489bcacc5f418094466 Mon Sep 17 00:00:00 2001 From: IWANABETHATGUY Date: Thu, 21 Dec 2023 01:23:17 +0800 Subject: [PATCH 02/19] =?UTF-8?q?chore:=20=F0=9F=A4=96=20ck=20point?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/oxc_js_regex/src/ast.rs | 6 +- crates/oxc_js_regex/src/parser.rs | 149 ++++++++++++++++++++++++++++-- 2 files changed, 146 insertions(+), 9 deletions(-) diff --git a/crates/oxc_js_regex/src/ast.rs b/crates/oxc_js_regex/src/ast.rs index 47ea3e203f3eb..0650f3053f4bc 100644 --- a/crates/oxc_js_regex/src/ast.rs +++ b/crates/oxc_js_regex/src/ast.rs @@ -157,8 +157,10 @@ pub struct LookbehindAssertion<'a> { #[derive(Debug)] pub struct Quantifier<'a> { pub span: Span, - pub min: f64, - pub max: f64, // can be f64::INFINITY + /// https://github.com/eslint-community/regexpp/blob/2e8f1af992fb12eae46a446253e8fa3f6cede92a/src/validator.ts#L384-L398 + /// both `min` and `max` are integer + pub min: usize, + pub max: usize, pub greedy: bool, pub element: QuantifiableElement<'a>, } diff --git a/crates/oxc_js_regex/src/parser.rs b/crates/oxc_js_regex/src/parser.rs index f48998a39de0e..7dc0e04253d6f 100644 --- a/crates/oxc_js_regex/src/parser.rs +++ b/crates/oxc_js_regex/src/parser.rs @@ -1,12 +1,18 @@ use std::collections::{HashSet, VecDeque}; use std::iter::Peekable; use std::ops::Range; +use std::os::unix::fs::OpenOptionsExt; +use std::panic; use std::str::{CharIndices, Chars, Matches}; use oxc_allocator::Allocator; use oxc_diagnostics::Error; +use oxc_span::Span; -use crate::ast::{Branch, Pattern, RegExpLiteral}; +use crate::ast::{ + Alternative, Assertion, Branch, Character, Element, Pattern, QuantifiableElement, Quantifier, + RegExpLiteral, +}; use crate::ecma_version::EcmaVersion; pub struct Lexer<'a> { @@ -38,7 +44,10 @@ pub struct Parser<'a> { context: ParserContext, index: usize, group_names: HashSet, -numCapturingParens: usize + num_capturing_parens: usize, + last_int_value: usize, + back_reference_names: HashSet, + last_range: Range, } #[derive(Default, Copy, Clone)] @@ -48,6 +57,7 @@ struct ParserContext { nflag: bool, unicode_sets_mode: bool, ecma_version: EcmaVersion, + strict: bool, } impl<'a> Parser<'a> { @@ -60,13 +70,30 @@ impl<'a> Parser<'a> { context: ParserContext::default(), index: 0, group_names: HashSet::new(), + num_capturing_parens: 0, + back_reference_names: HashSet::new(), + last_int_value: 0, + last_range: 0..0, } } - pub fn eat(&self, ch: char) -> bool { + pub fn is(&self, ch: char) -> bool { self.lexer.chars.get(self.index) == Some(&ch) } + pub fn eat(&mut self, ch: char) -> bool { + if self.is(ch) { + self.index += 1; + true + } else { + false + } + } + + pub fn eof(&self) -> bool { + self.index < self.lexer.chars.len() + } + pub fn nth(&self, n: usize) -> Option<&char> { self.lexer.chars.get(self.index + n) } @@ -108,7 +135,7 @@ pub enum SourceKind { } pub fn parse_literal<'a>(parser: &mut Parser<'a>) -> RegExpLiteral<'a> { - if parser.eat('/') { + if parser.is('/') { parser.advance(); let pattern = parse_pattern(parser); todo!() @@ -143,15 +170,122 @@ fn parse_pattern<'a>(parser: &mut Parser<'a>) -> Pattern<'a> { fn parse_pattern_internal<'a>(parser: &mut Parser<'a>) -> Option> { let start = parser.index; - let + parser.num_capturing_parens = count_capturing_parens(parser); + parser.group_names.clear(); + parser.back_reference_names.clear(); todo!() } +fn parse_disjunction<'a>(parser: &mut Parser<'a>) { + let start = parser.index; + let mut i = 0; + loop {} +} + +fn parser_alternative<'a>(parser: &mut Parser<'a>, i: usize) -> Alternative<'a> { + let start = parser.index; + // let mut elements = vec![]; + while !parser.eof() {} + Alternative { span: todo!(), elements: todo!() } +} + +fn parse_term<'a>(parser: &mut Parser<'a>) -> (bool, Option>) { + if parser.context.unicode_mode || parser.context.strict {} + todo!() +} + +fn parse_quantifier<'a>( + parser: &mut Parser<'a>, + no_consume: Option, +) -> (bool, Option>) { + let mut no_consume = no_consume.unwrap_or_default(); + let start = parser.index; + let mut min = 0; + let mut max = 0; + let mut greedy = false; + let mut element = None; + match parser.current().cloned() { + Some('*') => { + min = 0; + max = usize::MAX; + parser.advance(); + } + Some('+') => { + min = 1; + max = usize::MAX; + parser.advance(); + } + Some('?') => { + min = 0; + max = 1; + parser.advance(); + } + Some(_) => { + if parse_braced_quantifier(parser, no_consume) { + min = parser.last_range.start; + max = parser.last_range.end; + } + } + None => return (false, None), + } + greedy = !parser.eat('?'); + + if !no_consume { + element = Some(Element::Quantifier(Quantifier { + span: Span { start: start as u32, end: parser.index as u32 }, + min, + max, + greedy, + // https://github.com/eslint-community/regexpp/blob/2e8f1af992fb12eae46a446253e8fa3f6cede92a/src/parser.ts#L269-L275 + // it can't be null, or the program will panic, so we put a dummy element, and parent + // should replace it + element: QuantifiableElement::Character(Character { span: Span::default(), value: 0 }), + })) + } + (true, element) +} + +fn parse_braced_quantifier<'a>(parser: &mut Parser<'a>, no_error: bool) -> bool { + let start = parser.index; + if eat_decimal_digits(parser) { + let min = parser.last_int_value; + let mut max = min; + if parser.eat(',') { + max = if eat_decimal_digits(parser) { parser.last_int_value } else { usize::MAX }; + } + if parser.eat('}') { + if !no_error && max < min { + panic!("numbers out of order in {{}} quantifier"); + } + parser.last_range = min..max; + return true; + } + } + if !no_error && (parser.context.unicode_mode || parser.context.strict) { + panic!("Incomplete quantifier"); + } + parser.rewind(start); + false +} + +fn eat_decimal_digits<'a>(parser: &mut Parser<'a>) -> bool { + let start = parser.index; + parser.last_int_value = 0; + while let Some(ch) = parser.current() { + let Some(d) = ch.to_digit(10) else { + break; + }; + parser.last_int_value = 10 * parser.last_int_value + d as usize; + parser.advance(); + } + parser.index != start +} + fn count_capturing_parens<'a>(parser: &mut Parser<'a>) -> usize { let start = parser.index; let mut in_class = false; let mut escaped = false; - let count = 0; + let mut count = 0; while let Some(ch) = parser.current() { if escaped { escaped = false; @@ -165,7 +299,8 @@ fn count_capturing_parens<'a>(parser: &mut Parser<'a>) -> usize { } '(' if !in_class => { if parser.next() != Some(&'?') - || (parser.nth(2) == Some(&'<') && !matches!(parser.nth(3), '=' | '!')) + || (parser.nth(2) == Some(&'<') + && !matches!(parser.nth(3), Some(&'=') | Some(&'!'))) { count += 1; } From cd1e5b4cfb73d1415679b26c654b71eaf2e98ef7 Mon Sep 17 00:00:00 2001 From: IWANABETHATGUY Date: Thu, 21 Dec 2023 02:00:24 +0800 Subject: [PATCH 03/19] =?UTF-8?q?chore:=20=F0=9F=A4=96=20ck=20point?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/oxc_js_regex/src/parser.rs | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/crates/oxc_js_regex/src/parser.rs b/crates/oxc_js_regex/src/parser.rs index 7dc0e04253d6f..029cc2a2cddca 100644 --- a/crates/oxc_js_regex/src/parser.rs +++ b/crates/oxc_js_regex/src/parser.rs @@ -77,6 +77,9 @@ impl<'a> Parser<'a> { } } + fn alloc(&self, val: T) -> &mut T { + self.lexer.allocator.alloc(val) + } pub fn is(&self, ch: char) -> bool { self.lexer.chars.get(self.index) == Some(&ch) } @@ -194,6 +197,10 @@ fn parse_term<'a>(parser: &mut Parser<'a>) -> (bool, Option>) { todo!() } +fn parse_assertion<'a>(parser: &mut Parser<'a>) -> (bool, Option>) { + todo!() +} + fn parse_quantifier<'a>( parser: &mut Parser<'a>, no_consume: Option, @@ -231,7 +238,7 @@ fn parse_quantifier<'a>( greedy = !parser.eat('?'); if !no_consume { - element = Some(Element::Quantifier(Quantifier { + let quantifier = parser.alloc(Quantifier { span: Span { start: start as u32, end: parser.index as u32 }, min, max, @@ -240,7 +247,9 @@ fn parse_quantifier<'a>( // it can't be null, or the program will panic, so we put a dummy element, and parent // should replace it element: QuantifiableElement::Character(Character { span: Span::default(), value: 0 }), - })) + }); + + element = Some(Element::Quantifier(quantifier)) } (true, element) } From 95287445dc6cf77b393f13a70c94ba80168227ce Mon Sep 17 00:00:00 2001 From: IWANABETHATGUY Date: Sun, 14 Jan 2024 16:24:33 +0800 Subject: [PATCH 04/19] =?UTF-8?q?fix:=20=F0=9F=90=9B=20compile=20error?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ' | 355 ++++++++++++++++++++++++++++++ crates/oxc_js_regex/src/ast.rs | 2 +- crates/oxc_js_regex/src/parser.rs | 54 +++-- 3 files changed, 395 insertions(+), 16 deletions(-) create mode 100644 ' diff --git a/' b/' new file mode 100644 index 0000000000000..bec9fec060094 --- /dev/null +++ b/' @@ -0,0 +1,355 @@ +use std::collections::{HashSet, VecDeque}; +use std::iter::Peekable; +use std::ops::Range; +use std::os::unix::fs::OpenOptionsExt; +use std::panic; +use std::str::{CharIndices, Chars, Matches}; + +use oxc_allocator::Allocator; +use oxc_diagnostics::Error; +use oxc_span::Span; + +use crate::ast::{ + Alternative, Assertion, Branch, Character, Element, Pattern, QuantifiableElement, Quantifier, + RegExpLiteral, +}; +use crate::ecma_version::EcmaVersion; + +pub struct Lexer<'a> { + allocator: &'a Allocator, + + source: &'a str, + /// Regex usually, use a collected `Vec` could reduce lookahead and other util function implementation complexity + chars: Vec, + + pub(crate) errors: Vec, +} + +#[allow(clippy::unused_self)] +impl<'a> Lexer<'a> { + pub fn new(allocator: &'a Allocator, source: &'a str) -> Self { + Self { source, allocator, errors: vec![], chars: source.chars().collect::>() } + } +} + +pub struct Parser<'a> { + lexer: Lexer<'a>, + + /// Source Code + source_text: &'a str, + + /// All syntax errors from parser and lexer + /// Note: favor adding to `Diagnostics` instead of raising Err + errors: Vec, + context: ParserContext, + index: usize, + group_names: HashSet, + num_capturing_parens: usize, + last_int_value: usize, + back_reference_names: HashSet, + last_assertion_is_quantifiable: bool, + last_range: Range, +} + +#[derive(Default, Copy, Clone)] +struct ParserContext { + source_kind: SourceKind, + unicode_mode: bool, + nflag: bool, + unicode_sets_mode: bool, + ecma_version: EcmaVersion, + strict: bool, +} + +impl<'a> Parser<'a> { + /// Create a new parser + pub fn new(allocator: &'a Allocator, source_text: &'a str) -> Self { + Self { + lexer: Lexer::new(allocator, source_text), + source_text, + errors: vec![], + context: ParserContext::default(), + index: 0, + group_names: HashSet::new(), + num_capturing_parens: 0, + back_reference_names: HashSet::new(), + last_int_value: 0, + last_range: 0..0, + last_assertion_is_quantifiable: false, + } + } + + // fn alloc(&self, val: T) -> T { + // self.lexer.allocator.alloc(val) + // } + + #[inline] + pub fn alloc(&self, value: T) -> Box<'a, T> { + Box(self.lexer.allocator.alloc(value)) + } + pub fn is(&self, ch: char) -> bool { + self.lexer.chars.get(self.index) == Some(&ch) + } + + pub fn eat(&mut self, ch: char) -> bool { + if self.is(ch) { + self.index += 1; + true + } else { + false + } + } + + pub fn eof(&self) -> bool { + self.index < self.lexer.chars.len() + } + + pub fn nth(&self, n: usize) -> Option<&char> { + self.lexer.chars.get(self.index + n) + } + + /// by default next means `next_1` + pub fn next(&self) -> Option<&char> { + self.lexer.chars.get(self.index + 1) + } + + /// get a range chars relative from current cursor + pub fn nrange(&self, range: Range) -> Option<&[char]> { + self.lexer.chars.get(self.index + range.start..(self.index + range.end)) + } + + pub fn current(&self) -> Option<&char> { + self.lexer.chars.get(self.index) + } + + pub fn advance(&mut self) -> bool { + if self.index < self.lexer.chars.len() { + self.index += 1; + return true; + } else { + false + } + } + + pub fn rewind(&mut self, start: usize) { + self.index = start; + } +} + +#[derive(Default, Clone, Copy)] +pub enum SourceKind { + Flags, + #[default] + Literal, + Pattern, +} + +pub fn parse_literal<'a>(parser: &mut Parser<'a>) -> RegExpLiteral<'a> { + if parser.is('/') { + parser.advance(); + let pattern = parse_pattern(parser); + todo!() + } else if parser.source_text.is_empty() { + panic!("Empty") + } else { + match parser.current() { + Some(ch) => { + panic!("unexpected character {ch}") + } + None => { + panic!("unexpected eof") + } + }; + } +} + +fn parse_pattern<'a>(parser: &mut Parser<'a>) -> Pattern<'a> { + let start = parser.index; + if let Some(pattern) = parse_pattern_internal(parser) { + return pattern; + } else if !parser.context.nflag + && parser.context.ecma_version >= EcmaVersion::V2018 + && parser.group_names.len() > 0 + { + parser.rewind(start); + parser.context.nflag = true; + return parse_pattern_internal(parser).expect("should have pattern"); + } + panic!("Invalid pattern") +} + +fn parse_pattern_internal<'a>(parser: &mut Parser<'a>) -> Option> { + let start = parser.index; + parser.num_capturing_parens = count_capturing_parens(parser); + parser.group_names.clear(); + parser.back_reference_names.clear(); + todo!() +} + +fn parse_disjunction<'a>(parser: &mut Parser<'a>) { + let start = parser.index; + let mut i = 0; + loop {} +} + +/// Validate the next characters as a RegExp `Alternative` production. +/// ``` +/// Alternative[UnicodeMode, UnicodeSetsMode, N]:: +/// [empty] +/// Alternative[?UnicodeMode, ?UnicodeSetsMode, ?N] Term[?UnicodeMode, ?UnicodeSetsMode, ?N] +/// ``` +fn parser_alternative<'a>(parser: &mut Parser<'a>) -> Alternative<'a> { + let start = parser.index; + let mut elements = vec![]; + while !parser.eof() { + let term = parse_term(p); + } + Alternative { span: Span::new(start, parser.index), elements } +} + +fn parse_term<'a>(parser: &mut Parser<'a>) -> (bool, Option>) { + if parser.context.unicode_mode || parser.context.strict {} + todo!() +} + +fn parse_assertion<'a>(parser: &mut Parser<'a>) -> (bool, Option>) { + let start = parser.index; + parser.last_assertion_is_quantifiable = false; + + todo!() +} + +/// Validate the next characters as a RegExp `Quantifier` production if possible. +/// ``` +/// Quantifier:: +/// QuantifierPrefix +/// QuantifierPrefix `?` +/// QuantifierPrefix:: +/// `*` +/// `+` +/// `?` +/// `{` DecimalDigits `}` +/// `{` DecimalDigits `,}` +/// `{` DecimalDigits `,` DecimalDigits `}` +/// ``` +/// returns `true` if it consumed the next characters successfully. +fn parse_quantifier<'a>( + parser: &mut Parser<'a>, + no_consume: Option, +) -> (bool, Option>) { + let mut no_consume = no_consume.unwrap_or_default(); + let start = parser.index; + let mut min = 0; + let mut max = 0; + let mut greedy = false; + let mut element = None; + match parser.current().cloned() { + Some('*') => { + min = 0; + max = usize::MAX; + parser.advance(); + } + Some('+') => { + min = 1; + max = usize::MAX; + parser.advance(); + } + Some('?') => { + min = 0; + max = 1; + parser.advance(); + } + Some(_) => { + if parse_braced_quantifier(parser, no_consume) { + min = parser.last_range.start; + max = parser.last_range.end; + } + } + None => return (false, None), + } + greedy = !parser.eat('?'); + + if !no_consume { + let quantifier = parser.alloc(Quantifier { + span: Span { start: start as u32, end: parser.index as u32 }, + min, + max, + greedy, + // https://github.com/eslint-community/regexpp/blob/2e8f1af992fb12eae46a446253e8fa3f6cede92a/src/parser.ts#L269-L275 + // it can't be null, or the program will panic, so we put a dummy element, and parent + // should replace it + element: QuantifiableElement::Character(Character { span: Span::default(), value: 0 }), + }); + + element = Some(Element::Quantifier(quantifier)) + } + (true, element) +} + +fn parse_braced_quantifier<'a>(parser: &mut Parser<'a>, no_error: bool) -> bool { + let start = parser.index; + if eat_decimal_digits(parser) { + let min = parser.last_int_value; + let mut max = min; + if parser.eat(',') { + max = if eat_decimal_digits(parser) { parser.last_int_value } else { usize::MAX }; + } + if parser.eat('}') { + if !no_error && max < min { + panic!("numbers out of order in {{}} quantifier"); + } + parser.last_range = min..max; + return true; + } + } + if !no_error && (parser.context.unicode_mode || parser.context.strict) { + panic!("Incomplete quantifier"); + } + parser.rewind(start); + false +} + +fn eat_decimal_digits<'a>(parser: &mut Parser<'a>) -> bool { + let start = parser.index; + parser.last_int_value = 0; + while let Some(ch) = parser.current() { + let Some(d) = ch.to_digit(10) else { + break; + }; + parser.last_int_value = 10 * parser.last_int_value + d as usize; + parser.advance(); + } + parser.index != start +} + +fn count_capturing_parens<'a>(parser: &mut Parser<'a>) -> usize { + let start = parser.index; + let mut in_class = false; + let mut escaped = false; + let mut count = 0; + while let Some(ch) = parser.current() { + if escaped { + escaped = false; + } + match ch { + '\\' => { + escaped = true; + } + '[' | ']' => { + in_class = false; + } + '(' if !in_class => { + if parser.next() != Some(&'?') + || (parser.nth(2) == Some(&'<') + && !matches!(parser.nth(3), Some(&'=') | Some(&'!'))) + { + count += 1; + } + } + _ => {} + } + parser.advance(); + } + parser.rewind(start); + count +} diff --git a/crates/oxc_js_regex/src/ast.rs b/crates/oxc_js_regex/src/ast.rs index 0650f3053f4bc..5d429b4f063b2 100644 --- a/crates/oxc_js_regex/src/ast.rs +++ b/crates/oxc_js_regex/src/ast.rs @@ -46,7 +46,7 @@ pub enum Leaf<'a> { pub enum Element<'a> { Assertion(Assertion<'a>), QuantifiableElement(QuantifiableElement<'a>), - Quantifier(Quantifier<'a>), + Quantifier(Box<'a, Quantifier<'a>>), } /// The type which includes all atom nodes that Quantifier node can have as children. diff --git a/crates/oxc_js_regex/src/parser.rs b/crates/oxc_js_regex/src/parser.rs index 029cc2a2cddca..bc8dd4428d208 100644 --- a/crates/oxc_js_regex/src/parser.rs +++ b/crates/oxc_js_regex/src/parser.rs @@ -5,7 +5,6 @@ use std::os::unix::fs::OpenOptionsExt; use std::panic; use std::str::{CharIndices, Chars, Matches}; -use oxc_allocator::Allocator; use oxc_diagnostics::Error; use oxc_span::Span; @@ -13,11 +12,10 @@ use crate::ast::{ Alternative, Assertion, Branch, Character, Element, Pattern, QuantifiableElement, Quantifier, RegExpLiteral, }; +use crate::ast_builder::AstBuilder; use crate::ecma_version::EcmaVersion; pub struct Lexer<'a> { - allocator: &'a Allocator, - source: &'a str, /// Regex usually, use a collected `Vec` could reduce lookahead and other util function implementation complexity chars: Vec, @@ -27,13 +25,14 @@ pub struct Lexer<'a> { #[allow(clippy::unused_self)] impl<'a> Lexer<'a> { - pub fn new(allocator: &'a Allocator, source: &'a str) -> Self { - Self { source, allocator, errors: vec![], chars: source.chars().collect::>() } + pub fn new(source: &'a str) -> Self { + Self { source, errors: vec![], chars: source.chars().collect::>() } } } pub struct Parser<'a> { lexer: Lexer<'a>, + builder: AstBuilder<'a>, /// Source Code source_text: &'a str, @@ -47,6 +46,7 @@ pub struct Parser<'a> { num_capturing_parens: usize, last_int_value: usize, back_reference_names: HashSet, + last_assertion_is_quantifiable: bool, last_range: Range, } @@ -62,9 +62,9 @@ struct ParserContext { impl<'a> Parser<'a> { /// Create a new parser - pub fn new(allocator: &'a Allocator, source_text: &'a str) -> Self { + pub fn new(allocator: &'a oxc_allocator::Allocator, source_text: &'a str) -> Self { Self { - lexer: Lexer::new(allocator, source_text), + lexer: Lexer::new(source_text), source_text, errors: vec![], context: ParserContext::default(), @@ -74,12 +74,11 @@ impl<'a> Parser<'a> { back_reference_names: HashSet::new(), last_int_value: 0, last_range: 0..0, + last_assertion_is_quantifiable: false, + builder: AstBuilder::new(allocator), } } - fn alloc(&self, val: T) -> &mut T { - self.lexer.allocator.alloc(val) - } pub fn is(&self, ch: char) -> bool { self.lexer.chars.get(self.index) == Some(&ch) } @@ -185,11 +184,19 @@ fn parse_disjunction<'a>(parser: &mut Parser<'a>) { loop {} } -fn parser_alternative<'a>(parser: &mut Parser<'a>, i: usize) -> Alternative<'a> { +/// Validate the next characters as a RegExp `Alternative` production. +/// ``` +/// Alternative[UnicodeMode, UnicodeSetsMode, N]:: +/// [empty] +/// Alternative[?UnicodeMode, ?UnicodeSetsMode, ?N] Term[?UnicodeMode, ?UnicodeSetsMode, ?N] +/// ``` +fn parser_alternative<'a>(parser: &mut Parser<'a>) -> Alternative<'a> { let start = parser.index; - // let mut elements = vec![]; - while !parser.eof() {} - Alternative { span: todo!(), elements: todo!() } + let mut elements = parser.builder.new_vec(); + while !parser.eof() { + let term = parse_term(parser); + } + Alternative { span: Span::new(start as u32, parser.index as u32), elements } } fn parse_term<'a>(parser: &mut Parser<'a>) -> (bool, Option>) { @@ -198,9 +205,26 @@ fn parse_term<'a>(parser: &mut Parser<'a>) -> (bool, Option>) { } fn parse_assertion<'a>(parser: &mut Parser<'a>) -> (bool, Option>) { + let start = parser.index; + parser.last_assertion_is_quantifiable = false; + todo!() } +/// Validate the next characters as a RegExp `Quantifier` production if possible. +/// ``` +/// Quantifier:: +/// QuantifierPrefix +/// QuantifierPrefix `?` +/// QuantifierPrefix:: +/// `*` +/// `+` +/// `?` +/// `{` DecimalDigits `}` +/// `{` DecimalDigits `,}` +/// `{` DecimalDigits `,` DecimalDigits `}` +/// ``` +/// returns `true` if it consumed the next characters successfully. fn parse_quantifier<'a>( parser: &mut Parser<'a>, no_consume: Option, @@ -238,7 +262,7 @@ fn parse_quantifier<'a>( greedy = !parser.eat('?'); if !no_consume { - let quantifier = parser.alloc(Quantifier { + let quantifier = parser.builder.alloc(Quantifier { span: Span { start: start as u32, end: parser.index as u32 }, min, max, From e8e1d922925239cb9dbbff15d73fe1b9de902a7b Mon Sep 17 00:00:00 2001 From: IWANABETHATGUY Date: Sun, 14 Jan 2024 16:28:58 +0800 Subject: [PATCH 05/19] =?UTF-8?q?feat:=20=F0=9F=8E=B8=20parse=20alternativ?= =?UTF-8?q?e?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/oxc_js_regex/src/parser.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/crates/oxc_js_regex/src/parser.rs b/crates/oxc_js_regex/src/parser.rs index bc8dd4428d208..1517bcdd44f70 100644 --- a/crates/oxc_js_regex/src/parser.rs +++ b/crates/oxc_js_regex/src/parser.rs @@ -194,7 +194,13 @@ fn parser_alternative<'a>(parser: &mut Parser<'a>) -> Alternative<'a> { let start = parser.index; let mut elements = parser.builder.new_vec(); while !parser.eof() { - let term = parse_term(parser); + let (flag, node) = parse_term(parser); + if let Some(node) = node { + elements.push(node); + } + if !flag { + break; + } } Alternative { span: Span::new(start as u32, parser.index as u32), elements } } From 9c574e87a70084a4ae37d225814747ca2fdb5cad Mon Sep 17 00:00:00 2001 From: IWANABETHATGUY Date: Sun, 14 Jan 2024 16:38:03 +0800 Subject: [PATCH 06/19] =?UTF-8?q?feat:=20=F0=9F=8E=B8=20disjunction?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/oxc_js_regex/src/parser.rs | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/crates/oxc_js_regex/src/parser.rs b/crates/oxc_js_regex/src/parser.rs index 1517bcdd44f70..6e26300c4d18c 100644 --- a/crates/oxc_js_regex/src/parser.rs +++ b/crates/oxc_js_regex/src/parser.rs @@ -178,10 +178,23 @@ fn parse_pattern_internal<'a>(parser: &mut Parser<'a>) -> Option> { todo!() } -fn parse_disjunction<'a>(parser: &mut Parser<'a>) { +fn parse_disjunction<'a>(parser: &mut Parser<'a>) -> oxc_allocator::Vec<'a, Alternative<'a>> { let start = parser.index; - let mut i = 0; - loop {} + let mut ret = parser.builder.new_vec(); + loop { + ret.push(parser_alternative(parser)); + if !parser.eat('|') { + break; + } + } + // Only consume the ast when `no_consume` is false + if parse_quantifier(parser, Some(true)).0 { + panic!("Nothing to repeat"); + } + if parser.eat('{') { + panic!("Lone quantifier brackets") + } + ret } /// Validate the next characters as a RegExp `Alternative` production. From e25ac163aeb03b062c474d7e836cfaf0f21b8de8 Mon Sep 17 00:00:00 2001 From: IWANABETHATGUY Date: Sun, 14 Jan 2024 17:22:12 +0800 Subject: [PATCH 07/19] =?UTF-8?q?feat:=20=F0=9F=8E=B8=20parse=20assertion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ; | 377 ++++++++++++++++++++++++++++++ crates/oxc_js_regex/src/parser.rs | 102 +++++++- 2 files changed, 472 insertions(+), 7 deletions(-) create mode 100644 ; diff --git a/; b/; new file mode 100644 index 0000000000000..acc0dcbc178a7 --- /dev/null +++ b/; @@ -0,0 +1,377 @@ +use std::collections::{HashSet, VecDeque}; +use std::iter::Peekable; +use std::ops::Range; +use std::os::unix::fs::OpenOptionsExt; +use std::panic; +use std::str::{CharIndices, Chars, Matches}; + +use oxc_diagnostics::Error; +use oxc_span::Span; + +use crate::ast::{ + Alternative, Assertion, BoundaryAssertion, Branch, Character, EdgeAssertion, Element, Pattern, + QuantifiableElement, Quantifier, RegExpLiteral, +}; +use crate::ast_builder::AstBuilder; +use crate::ecma_version::EcmaVersion; + +pub struct Lexer<'a> { + source: &'a str, + /// Regex usually, use a collected `Vec` could reduce lookahead and other util function implementation complexity + chars: Vec, + + pub(crate) errors: Vec, +} + +#[allow(clippy::unused_self)] +impl<'a> Lexer<'a> { + pub fn new(source: &'a str) -> Self { + Self { source, errors: vec![], chars: source.chars().collect::>() } + } +} + +pub struct Parser<'a> { + lexer: Lexer<'a>, + builder: AstBuilder<'a>, + + /// Source Code + source_text: &'a str, + + /// All syntax errors from parser and lexer + /// Note: favor adding to `Diagnostics` instead of raising Err + errors: Vec, + context: ParserContext, + index: usize, + group_names: HashSet, + num_capturing_parens: usize, + last_int_value: usize, + back_reference_names: HashSet, + last_assertion_is_quantifiable: bool, + last_range: Range, +} + +#[derive(Default, Copy, Clone)] +struct ParserContext { + source_kind: SourceKind, + unicode_mode: bool, + nflag: bool, + unicode_sets_mode: bool, + ecma_version: EcmaVersion, + strict: bool, +} + +impl<'a> Parser<'a> { + /// Create a new parser + pub fn new(allocator: &'a oxc_allocator::Allocator, source_text: &'a str) -> Self { + Self { + lexer: Lexer::new(source_text), + source_text, + errors: vec![], + context: ParserContext::default(), + index: 0, + group_names: HashSet::new(), + num_capturing_parens: 0, + back_reference_names: HashSet::new(), + last_int_value: 0, + last_range: 0..0, + last_assertion_is_quantifiable: false, + builder: AstBuilder::new(allocator), + } + } + + pub fn is(&self, ch: char) -> bool { + self.lexer.chars.get(self.index) == Some(&ch) + } + + pub fn eat(&mut self, ch: char) -> bool { + if self.is(ch) { + self.index += 1; + true + } else { + false + } + } + + pub fn eof(&self) -> bool { + self.index < self.lexer.chars.len() + } + + pub fn nth(&self, n: usize) -> Option<&char> { + self.lexer.chars.get(self.index + n) + } + + /// by default next means `next_1` + pub fn next(&self) -> Option<&char> { + self.lexer.chars.get(self.index + 1) + } + + /// get a range chars relative from current cursor + pub fn nrange(&self, range: Range) -> Option<&[char]> { + self.lexer.chars.get(self.index + range.start..(self.index + range.end)) + } + + pub fn current(&self) -> Option<&char> { + self.lexer.chars.get(self.index) + } + + pub fn advance(&mut self) -> bool { + if self.index < self.lexer.chars.len() { + self.index += 1; + return true; + } else { + false + } + } + + pub fn rewind(&mut self, start: usize) { + self.index = start; + } +} + +#[derive(Default, Clone, Copy)] +pub enum SourceKind { + Flags, + #[default] + Literal, + Pattern, +} + +pub fn parse_literal<'a>(parser: &mut Parser<'a>) -> RegExpLiteral<'a> { + if parser.is('/') { + parser.advance(); + let pattern = parse_pattern(parser); + todo!() + } else if parser.source_text.is_empty() { + panic!("Empty") + } else { + match parser.current() { + Some(ch) => { + panic!("unexpected character {ch}") + } + None => { + panic!("unexpected eof") + } + }; + } +} + +fn parse_pattern<'a>(parser: &mut Parser<'a>) -> Pattern<'a> { + let start = parser.index; + if let Some(pattern) = parse_pattern_internal(parser) { + return pattern; + } else if !parser.context.nflag + && parser.context.ecma_version >= EcmaVersion::V2018 + && parser.group_names.len() > 0 + { + parser.rewind(start); + parser.context.nflag = true; + return parse_pattern_internal(parser).expect("should have pattern"); + } + panic!("Invalid pattern") +} + +fn parse_pattern_internal<'a>(parser: &mut Parser<'a>) -> Option> { + let start = parser.index; + parser.num_capturing_parens = count_capturing_parens(parser); + parser.group_names.clear(); + parser.back_reference_names.clear(); + todo!() +} + +fn parse_disjunction<'a>(parser: &mut Parser<'a>) -> oxc_allocator::Vec<'a, Alternative<'a>> { + let start = parser.index; + let mut alternatives = parser.builder.new_vec(); + loop { + alternatives.push(parser_alternative(parser)); + if !parser.eat('|') { + break; + } + } + // Only consume the ast when `no_consume` is false + if parse_quantifier(parser, Some(true)).0 { + panic!("Nothing to repeat"); + } + if parser.eat('{') { + panic!("Lone quantifier brackets") + } + alternatives +} + +/// Validate the next characters as a RegExp `Alternative` production. +/// ``` +/// Alternative[UnicodeMode, UnicodeSetsMode, N]:: +/// [empty] +/// Alternative[?UnicodeMode, ?UnicodeSetsMode, ?N] Term[?UnicodeMode, ?UnicodeSetsMode, ?N] +/// ``` +fn parser_alternative<'a>(parser: &mut Parser<'a>) -> Alternative<'a> { + let start = parser.index; + let mut elements = parser.builder.new_vec(); + while !parser.eof() { + let (flag, node) = parse_term(parser); + if let Some(node) = node { + elements.push(node); + } + if !flag { + break; + } + } + Alternative { span: Span::new(start as u32, parser.index as u32), elements } +} + +fn parse_term<'a>(parser: &mut Parser<'a>) -> (bool, Option>) { + if parser.context.unicode_mode || parser.context.strict {} + todo!() +} + +fn parse_assertion<'a>(parser: &mut Parser<'a>) -> (bool, Option>) { + let start = parser.index; + parser.last_assertion_is_quantifiable = false; + + if parser.eat('^') { + return ( + true, + Some(Assertion::BoundaryAssertion(parser.builder.alloc(BoundaryAssertion::EdgeAssertion( + parser.builder.alloc(EdgeAssertion { + span: Span::new(start as u32, parser.index as u32), + kind: todo!(), + }), + ))), + ); + } + todo!() +} + +/// Validate the next characters as a RegExp `Quantifier` production if possible. +/// ``` +/// Quantifier:: +/// QuantifierPrefix +/// QuantifierPrefix `?` +/// QuantifierPrefix:: +/// `*` +/// `+` +/// `?` +/// `{` DecimalDigits `}` +/// `{` DecimalDigits `,}` +/// `{` DecimalDigits `,` DecimalDigits `}` +/// ``` +/// returns `true` if it consumed the next characters successfully. +fn parse_quantifier<'a>( + parser: &mut Parser<'a>, + no_consume: Option, +) -> (bool, Option>) { + let mut no_consume = no_consume.unwrap_or_default(); + let start = parser.index; + let mut min = 0; + let mut max = 0; + let mut greedy = false; + let mut element = None; + match parser.current().cloned() { + Some('*') => { + min = 0; + max = usize::MAX; + parser.advance(); + } + Some('+') => { + min = 1; + max = usize::MAX; + parser.advance(); + } + Some('?') => { + min = 0; + max = 1; + parser.advance(); + } + Some(_) => { + if parse_braced_quantifier(parser, no_consume) { + min = parser.last_range.start; + max = parser.last_range.end; + } + } + None => return (false, None), + } + greedy = !parser.eat('?'); + + if !no_consume { + let quantifier = parser.builder.alloc(Quantifier { + span: Span { start: start as u32, end: parser.index as u32 }, + min, + max, + greedy, + // https://github.com/eslint-community/regexpp/blob/2e8f1af992fb12eae46a446253e8fa3f6cede92a/src/parser.ts#L269-L275 + // it can't be null, or the program will panic, so we put a dummy element, and parent + // should replace it + element: QuantifiableElement::Character(Character { span: Span::default(), value: 0 }), + }); + + element = Some(Element::Quantifier(quantifier)) + } + (true, element) +} + +fn parse_braced_quantifier<'a>(parser: &mut Parser<'a>, no_error: bool) -> bool { + let start = parser.index; + if eat_decimal_digits(parser) { + let min = parser.last_int_value; + let mut max = min; + if parser.eat(',') { + max = if eat_decimal_digits(parser) { parser.last_int_value } else { usize::MAX }; + } + if parser.eat('}') { + if !no_error && max < min { + panic!("numbers out of order in {{}} quantifier"); + } + parser.last_range = min..max; + return true; + } + } + if !no_error && (parser.context.unicode_mode || parser.context.strict) { + panic!("Incomplete quantifier"); + } + parser.rewind(start); + false +} + +fn eat_decimal_digits<'a>(parser: &mut Parser<'a>) -> bool { + let start = parser.index; + parser.last_int_value = 0; + while let Some(ch) = parser.current() { + let Some(d) = ch.to_digit(10) else { + break; + }; + parser.last_int_value = 10 * parser.last_int_value + d as usize; + parser.advance(); + } + parser.index != start +} + +fn count_capturing_parens<'a>(parser: &mut Parser<'a>) -> usize { + let start = parser.index; + let mut in_class = false; + let mut escaped = false; + let mut count = 0; + while let Some(ch) = parser.current() { + if escaped { + escaped = false; + } + match ch { + '\\' => { + escaped = true; + } + '[' | ']' => { + in_class = false; + } + '(' if !in_class => { + if parser.next() != Some(&'?') + || (parser.nth(2) == Some(&'<') + && !matches!(parser.nth(3), Some(&'=') | Some(&'!'))) + { + count += 1; + } + } + _ => {} + } + parser.advance(); + } + parser.rewind(start); + count +} diff --git a/crates/oxc_js_regex/src/parser.rs b/crates/oxc_js_regex/src/parser.rs index 6e26300c4d18c..e7854d063d509 100644 --- a/crates/oxc_js_regex/src/parser.rs +++ b/crates/oxc_js_regex/src/parser.rs @@ -9,8 +9,9 @@ use oxc_diagnostics::Error; use oxc_span::Span; use crate::ast::{ - Alternative, Assertion, Branch, Character, Element, Pattern, QuantifiableElement, Quantifier, - RegExpLiteral, + Alternative, Assertion, BoundaryAssertion, Branch, Character, EdgeAssertion, EdgeAssertionKind, + Element, LookaheadAssertion, LookaroundAssertion, LookbehindAssertion, Pattern, + QuantifiableElement, Quantifier, RegExpLiteral, WordBoundaryAssertion, }; use crate::ast_builder::AstBuilder; use crate::ecma_version::EcmaVersion; @@ -92,6 +93,15 @@ impl<'a> Parser<'a> { } } + pub fn eat2(&mut self, first: char, second: char) -> bool { + if self.is(first) && self.nth(1) == Some(&second) { + self.index += 2; + true + } else { + false + } + } + pub fn eof(&self) -> bool { self.index < self.lexer.chars.len() } @@ -180,9 +190,9 @@ fn parse_pattern_internal<'a>(parser: &mut Parser<'a>) -> Option> { fn parse_disjunction<'a>(parser: &mut Parser<'a>) -> oxc_allocator::Vec<'a, Alternative<'a>> { let start = parser.index; - let mut ret = parser.builder.new_vec(); + let mut alternatives = parser.builder.new_vec(); loop { - ret.push(parser_alternative(parser)); + alternatives.push(parse_alternative(parser)); if !parser.eat('|') { break; } @@ -194,7 +204,7 @@ fn parse_disjunction<'a>(parser: &mut Parser<'a>) -> oxc_allocator::Vec<'a, Alte if parser.eat('{') { panic!("Lone quantifier brackets") } - ret + alternatives } /// Validate the next characters as a RegExp `Alternative` production. @@ -203,7 +213,7 @@ fn parse_disjunction<'a>(parser: &mut Parser<'a>) -> oxc_allocator::Vec<'a, Alte /// [empty] /// Alternative[?UnicodeMode, ?UnicodeSetsMode, ?N] Term[?UnicodeMode, ?UnicodeSetsMode, ?N] /// ``` -fn parser_alternative<'a>(parser: &mut Parser<'a>) -> Alternative<'a> { +fn parse_alternative<'a>(parser: &mut Parser<'a>) -> Alternative<'a> { let start = parser.index; let mut elements = parser.builder.new_vec(); while !parser.eof() { @@ -227,7 +237,85 @@ fn parse_assertion<'a>(parser: &mut Parser<'a>) -> (bool, Option>) let start = parser.index; parser.last_assertion_is_quantifiable = false; - todo!() + if parser.eat('^') { + return ( + true, + Some(Assertion::BoundaryAssertion(parser.builder.alloc( + BoundaryAssertion::EdgeAssertion(parser.builder.alloc(EdgeAssertion { + span: Span::new(start as u32, parser.index as u32), + kind: EdgeAssertionKind::Start, + })), + ))), + ); + } + + if parser.eat('$') { + return ( + true, + Some(Assertion::BoundaryAssertion(parser.builder.alloc( + BoundaryAssertion::EdgeAssertion(parser.builder.alloc(EdgeAssertion { + span: Span::new(start as u32, parser.index as u32), + kind: EdgeAssertionKind::End, + })), + ))), + ); + } + + if parser.eat2('\\', 'B') { + return ( + true, + Some(Assertion::BoundaryAssertion(parser.builder.alloc( + BoundaryAssertion::WordBoundaryAssertion(parser.builder.alloc( + WordBoundaryAssertion { + span: Span::new(start as u32, parser.index as u32), + negate: true, + }, + )), + ))), + ); + } + + if parser.eat2('\\', 'b') { + return ( + true, + Some(Assertion::BoundaryAssertion(parser.builder.alloc( + BoundaryAssertion::WordBoundaryAssertion(parser.builder.alloc( + WordBoundaryAssertion { + span: Span::new(start as u32, parser.index as u32), + negate: false, + }, + )), + ))), + ); + } + + // Lookahead / Lookbehind + if parser.eat2('(', '?') { + let lookbeind = parser.context.ecma_version >= EcmaVersion::V2018 && parser.eat('<'); + let mut eq_sign = parser.eat('='); + let mut negate = if eq_sign { false } else { parser.eat('!') }; + if eq_sign || negate { + let span = Span::new(start as u32, parser.index as u32); + let alternatives = parse_disjunction(parser); + let look_around_assertion = + if lookbeind { + LookaroundAssertion::LookbehindAssertion( + parser.builder.alloc(LookbehindAssertion { span, negate, alternatives }), + ) + } else { + LookaroundAssertion::LookaheadAssertion( + parser.builder.alloc(LookaheadAssertion { span, negate, alternatives }), + ) + }; + let node = Assertion::LookaroundAssertion(parser.builder.alloc(look_around_assertion)); + if !parser.eat(')') { + panic!("Unterminated group") + } + parser.last_assertion_is_quantifiable = !lookbeind && !parser.context.strict; + } + parser.rewind(start); + } + (false, None) } /// Validate the next characters as a RegExp `Quantifier` production if possible. From 54501a7b0e829f834f0b16fa2bb4e91afb9e7590 Mon Sep 17 00:00:00 2001 From: IWANABETHATGUY Date: Sun, 14 Jan 2024 21:36:40 +0800 Subject: [PATCH 08/19] =?UTF-8?q?chore:=20=F0=9F=A4=96=20copy=20all?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ' | 355 --------- ; | 377 ---------- crates/oxc_js_regex/src/ast.rs | 2 +- crates/oxc_js_regex/src/parser.rs | 1120 ++++++++++++++++++++++++++++- 4 files changed, 1118 insertions(+), 736 deletions(-) delete mode 100644 ' delete mode 100644 ; diff --git a/' b/' deleted file mode 100644 index bec9fec060094..0000000000000 --- a/' +++ /dev/null @@ -1,355 +0,0 @@ -use std::collections::{HashSet, VecDeque}; -use std::iter::Peekable; -use std::ops::Range; -use std::os::unix::fs::OpenOptionsExt; -use std::panic; -use std::str::{CharIndices, Chars, Matches}; - -use oxc_allocator::Allocator; -use oxc_diagnostics::Error; -use oxc_span::Span; - -use crate::ast::{ - Alternative, Assertion, Branch, Character, Element, Pattern, QuantifiableElement, Quantifier, - RegExpLiteral, -}; -use crate::ecma_version::EcmaVersion; - -pub struct Lexer<'a> { - allocator: &'a Allocator, - - source: &'a str, - /// Regex usually, use a collected `Vec` could reduce lookahead and other util function implementation complexity - chars: Vec, - - pub(crate) errors: Vec, -} - -#[allow(clippy::unused_self)] -impl<'a> Lexer<'a> { - pub fn new(allocator: &'a Allocator, source: &'a str) -> Self { - Self { source, allocator, errors: vec![], chars: source.chars().collect::>() } - } -} - -pub struct Parser<'a> { - lexer: Lexer<'a>, - - /// Source Code - source_text: &'a str, - - /// All syntax errors from parser and lexer - /// Note: favor adding to `Diagnostics` instead of raising Err - errors: Vec, - context: ParserContext, - index: usize, - group_names: HashSet, - num_capturing_parens: usize, - last_int_value: usize, - back_reference_names: HashSet, - last_assertion_is_quantifiable: bool, - last_range: Range, -} - -#[derive(Default, Copy, Clone)] -struct ParserContext { - source_kind: SourceKind, - unicode_mode: bool, - nflag: bool, - unicode_sets_mode: bool, - ecma_version: EcmaVersion, - strict: bool, -} - -impl<'a> Parser<'a> { - /// Create a new parser - pub fn new(allocator: &'a Allocator, source_text: &'a str) -> Self { - Self { - lexer: Lexer::new(allocator, source_text), - source_text, - errors: vec![], - context: ParserContext::default(), - index: 0, - group_names: HashSet::new(), - num_capturing_parens: 0, - back_reference_names: HashSet::new(), - last_int_value: 0, - last_range: 0..0, - last_assertion_is_quantifiable: false, - } - } - - // fn alloc(&self, val: T) -> T { - // self.lexer.allocator.alloc(val) - // } - - #[inline] - pub fn alloc(&self, value: T) -> Box<'a, T> { - Box(self.lexer.allocator.alloc(value)) - } - pub fn is(&self, ch: char) -> bool { - self.lexer.chars.get(self.index) == Some(&ch) - } - - pub fn eat(&mut self, ch: char) -> bool { - if self.is(ch) { - self.index += 1; - true - } else { - false - } - } - - pub fn eof(&self) -> bool { - self.index < self.lexer.chars.len() - } - - pub fn nth(&self, n: usize) -> Option<&char> { - self.lexer.chars.get(self.index + n) - } - - /// by default next means `next_1` - pub fn next(&self) -> Option<&char> { - self.lexer.chars.get(self.index + 1) - } - - /// get a range chars relative from current cursor - pub fn nrange(&self, range: Range) -> Option<&[char]> { - self.lexer.chars.get(self.index + range.start..(self.index + range.end)) - } - - pub fn current(&self) -> Option<&char> { - self.lexer.chars.get(self.index) - } - - pub fn advance(&mut self) -> bool { - if self.index < self.lexer.chars.len() { - self.index += 1; - return true; - } else { - false - } - } - - pub fn rewind(&mut self, start: usize) { - self.index = start; - } -} - -#[derive(Default, Clone, Copy)] -pub enum SourceKind { - Flags, - #[default] - Literal, - Pattern, -} - -pub fn parse_literal<'a>(parser: &mut Parser<'a>) -> RegExpLiteral<'a> { - if parser.is('/') { - parser.advance(); - let pattern = parse_pattern(parser); - todo!() - } else if parser.source_text.is_empty() { - panic!("Empty") - } else { - match parser.current() { - Some(ch) => { - panic!("unexpected character {ch}") - } - None => { - panic!("unexpected eof") - } - }; - } -} - -fn parse_pattern<'a>(parser: &mut Parser<'a>) -> Pattern<'a> { - let start = parser.index; - if let Some(pattern) = parse_pattern_internal(parser) { - return pattern; - } else if !parser.context.nflag - && parser.context.ecma_version >= EcmaVersion::V2018 - && parser.group_names.len() > 0 - { - parser.rewind(start); - parser.context.nflag = true; - return parse_pattern_internal(parser).expect("should have pattern"); - } - panic!("Invalid pattern") -} - -fn parse_pattern_internal<'a>(parser: &mut Parser<'a>) -> Option> { - let start = parser.index; - parser.num_capturing_parens = count_capturing_parens(parser); - parser.group_names.clear(); - parser.back_reference_names.clear(); - todo!() -} - -fn parse_disjunction<'a>(parser: &mut Parser<'a>) { - let start = parser.index; - let mut i = 0; - loop {} -} - -/// Validate the next characters as a RegExp `Alternative` production. -/// ``` -/// Alternative[UnicodeMode, UnicodeSetsMode, N]:: -/// [empty] -/// Alternative[?UnicodeMode, ?UnicodeSetsMode, ?N] Term[?UnicodeMode, ?UnicodeSetsMode, ?N] -/// ``` -fn parser_alternative<'a>(parser: &mut Parser<'a>) -> Alternative<'a> { - let start = parser.index; - let mut elements = vec![]; - while !parser.eof() { - let term = parse_term(p); - } - Alternative { span: Span::new(start, parser.index), elements } -} - -fn parse_term<'a>(parser: &mut Parser<'a>) -> (bool, Option>) { - if parser.context.unicode_mode || parser.context.strict {} - todo!() -} - -fn parse_assertion<'a>(parser: &mut Parser<'a>) -> (bool, Option>) { - let start = parser.index; - parser.last_assertion_is_quantifiable = false; - - todo!() -} - -/// Validate the next characters as a RegExp `Quantifier` production if possible. -/// ``` -/// Quantifier:: -/// QuantifierPrefix -/// QuantifierPrefix `?` -/// QuantifierPrefix:: -/// `*` -/// `+` -/// `?` -/// `{` DecimalDigits `}` -/// `{` DecimalDigits `,}` -/// `{` DecimalDigits `,` DecimalDigits `}` -/// ``` -/// returns `true` if it consumed the next characters successfully. -fn parse_quantifier<'a>( - parser: &mut Parser<'a>, - no_consume: Option, -) -> (bool, Option>) { - let mut no_consume = no_consume.unwrap_or_default(); - let start = parser.index; - let mut min = 0; - let mut max = 0; - let mut greedy = false; - let mut element = None; - match parser.current().cloned() { - Some('*') => { - min = 0; - max = usize::MAX; - parser.advance(); - } - Some('+') => { - min = 1; - max = usize::MAX; - parser.advance(); - } - Some('?') => { - min = 0; - max = 1; - parser.advance(); - } - Some(_) => { - if parse_braced_quantifier(parser, no_consume) { - min = parser.last_range.start; - max = parser.last_range.end; - } - } - None => return (false, None), - } - greedy = !parser.eat('?'); - - if !no_consume { - let quantifier = parser.alloc(Quantifier { - span: Span { start: start as u32, end: parser.index as u32 }, - min, - max, - greedy, - // https://github.com/eslint-community/regexpp/blob/2e8f1af992fb12eae46a446253e8fa3f6cede92a/src/parser.ts#L269-L275 - // it can't be null, or the program will panic, so we put a dummy element, and parent - // should replace it - element: QuantifiableElement::Character(Character { span: Span::default(), value: 0 }), - }); - - element = Some(Element::Quantifier(quantifier)) - } - (true, element) -} - -fn parse_braced_quantifier<'a>(parser: &mut Parser<'a>, no_error: bool) -> bool { - let start = parser.index; - if eat_decimal_digits(parser) { - let min = parser.last_int_value; - let mut max = min; - if parser.eat(',') { - max = if eat_decimal_digits(parser) { parser.last_int_value } else { usize::MAX }; - } - if parser.eat('}') { - if !no_error && max < min { - panic!("numbers out of order in {{}} quantifier"); - } - parser.last_range = min..max; - return true; - } - } - if !no_error && (parser.context.unicode_mode || parser.context.strict) { - panic!("Incomplete quantifier"); - } - parser.rewind(start); - false -} - -fn eat_decimal_digits<'a>(parser: &mut Parser<'a>) -> bool { - let start = parser.index; - parser.last_int_value = 0; - while let Some(ch) = parser.current() { - let Some(d) = ch.to_digit(10) else { - break; - }; - parser.last_int_value = 10 * parser.last_int_value + d as usize; - parser.advance(); - } - parser.index != start -} - -fn count_capturing_parens<'a>(parser: &mut Parser<'a>) -> usize { - let start = parser.index; - let mut in_class = false; - let mut escaped = false; - let mut count = 0; - while let Some(ch) = parser.current() { - if escaped { - escaped = false; - } - match ch { - '\\' => { - escaped = true; - } - '[' | ']' => { - in_class = false; - } - '(' if !in_class => { - if parser.next() != Some(&'?') - || (parser.nth(2) == Some(&'<') - && !matches!(parser.nth(3), Some(&'=') | Some(&'!'))) - { - count += 1; - } - } - _ => {} - } - parser.advance(); - } - parser.rewind(start); - count -} diff --git a/; b/; deleted file mode 100644 index acc0dcbc178a7..0000000000000 --- a/; +++ /dev/null @@ -1,377 +0,0 @@ -use std::collections::{HashSet, VecDeque}; -use std::iter::Peekable; -use std::ops::Range; -use std::os::unix::fs::OpenOptionsExt; -use std::panic; -use std::str::{CharIndices, Chars, Matches}; - -use oxc_diagnostics::Error; -use oxc_span::Span; - -use crate::ast::{ - Alternative, Assertion, BoundaryAssertion, Branch, Character, EdgeAssertion, Element, Pattern, - QuantifiableElement, Quantifier, RegExpLiteral, -}; -use crate::ast_builder::AstBuilder; -use crate::ecma_version::EcmaVersion; - -pub struct Lexer<'a> { - source: &'a str, - /// Regex usually, use a collected `Vec` could reduce lookahead and other util function implementation complexity - chars: Vec, - - pub(crate) errors: Vec, -} - -#[allow(clippy::unused_self)] -impl<'a> Lexer<'a> { - pub fn new(source: &'a str) -> Self { - Self { source, errors: vec![], chars: source.chars().collect::>() } - } -} - -pub struct Parser<'a> { - lexer: Lexer<'a>, - builder: AstBuilder<'a>, - - /// Source Code - source_text: &'a str, - - /// All syntax errors from parser and lexer - /// Note: favor adding to `Diagnostics` instead of raising Err - errors: Vec, - context: ParserContext, - index: usize, - group_names: HashSet, - num_capturing_parens: usize, - last_int_value: usize, - back_reference_names: HashSet, - last_assertion_is_quantifiable: bool, - last_range: Range, -} - -#[derive(Default, Copy, Clone)] -struct ParserContext { - source_kind: SourceKind, - unicode_mode: bool, - nflag: bool, - unicode_sets_mode: bool, - ecma_version: EcmaVersion, - strict: bool, -} - -impl<'a> Parser<'a> { - /// Create a new parser - pub fn new(allocator: &'a oxc_allocator::Allocator, source_text: &'a str) -> Self { - Self { - lexer: Lexer::new(source_text), - source_text, - errors: vec![], - context: ParserContext::default(), - index: 0, - group_names: HashSet::new(), - num_capturing_parens: 0, - back_reference_names: HashSet::new(), - last_int_value: 0, - last_range: 0..0, - last_assertion_is_quantifiable: false, - builder: AstBuilder::new(allocator), - } - } - - pub fn is(&self, ch: char) -> bool { - self.lexer.chars.get(self.index) == Some(&ch) - } - - pub fn eat(&mut self, ch: char) -> bool { - if self.is(ch) { - self.index += 1; - true - } else { - false - } - } - - pub fn eof(&self) -> bool { - self.index < self.lexer.chars.len() - } - - pub fn nth(&self, n: usize) -> Option<&char> { - self.lexer.chars.get(self.index + n) - } - - /// by default next means `next_1` - pub fn next(&self) -> Option<&char> { - self.lexer.chars.get(self.index + 1) - } - - /// get a range chars relative from current cursor - pub fn nrange(&self, range: Range) -> Option<&[char]> { - self.lexer.chars.get(self.index + range.start..(self.index + range.end)) - } - - pub fn current(&self) -> Option<&char> { - self.lexer.chars.get(self.index) - } - - pub fn advance(&mut self) -> bool { - if self.index < self.lexer.chars.len() { - self.index += 1; - return true; - } else { - false - } - } - - pub fn rewind(&mut self, start: usize) { - self.index = start; - } -} - -#[derive(Default, Clone, Copy)] -pub enum SourceKind { - Flags, - #[default] - Literal, - Pattern, -} - -pub fn parse_literal<'a>(parser: &mut Parser<'a>) -> RegExpLiteral<'a> { - if parser.is('/') { - parser.advance(); - let pattern = parse_pattern(parser); - todo!() - } else if parser.source_text.is_empty() { - panic!("Empty") - } else { - match parser.current() { - Some(ch) => { - panic!("unexpected character {ch}") - } - None => { - panic!("unexpected eof") - } - }; - } -} - -fn parse_pattern<'a>(parser: &mut Parser<'a>) -> Pattern<'a> { - let start = parser.index; - if let Some(pattern) = parse_pattern_internal(parser) { - return pattern; - } else if !parser.context.nflag - && parser.context.ecma_version >= EcmaVersion::V2018 - && parser.group_names.len() > 0 - { - parser.rewind(start); - parser.context.nflag = true; - return parse_pattern_internal(parser).expect("should have pattern"); - } - panic!("Invalid pattern") -} - -fn parse_pattern_internal<'a>(parser: &mut Parser<'a>) -> Option> { - let start = parser.index; - parser.num_capturing_parens = count_capturing_parens(parser); - parser.group_names.clear(); - parser.back_reference_names.clear(); - todo!() -} - -fn parse_disjunction<'a>(parser: &mut Parser<'a>) -> oxc_allocator::Vec<'a, Alternative<'a>> { - let start = parser.index; - let mut alternatives = parser.builder.new_vec(); - loop { - alternatives.push(parser_alternative(parser)); - if !parser.eat('|') { - break; - } - } - // Only consume the ast when `no_consume` is false - if parse_quantifier(parser, Some(true)).0 { - panic!("Nothing to repeat"); - } - if parser.eat('{') { - panic!("Lone quantifier brackets") - } - alternatives -} - -/// Validate the next characters as a RegExp `Alternative` production. -/// ``` -/// Alternative[UnicodeMode, UnicodeSetsMode, N]:: -/// [empty] -/// Alternative[?UnicodeMode, ?UnicodeSetsMode, ?N] Term[?UnicodeMode, ?UnicodeSetsMode, ?N] -/// ``` -fn parser_alternative<'a>(parser: &mut Parser<'a>) -> Alternative<'a> { - let start = parser.index; - let mut elements = parser.builder.new_vec(); - while !parser.eof() { - let (flag, node) = parse_term(parser); - if let Some(node) = node { - elements.push(node); - } - if !flag { - break; - } - } - Alternative { span: Span::new(start as u32, parser.index as u32), elements } -} - -fn parse_term<'a>(parser: &mut Parser<'a>) -> (bool, Option>) { - if parser.context.unicode_mode || parser.context.strict {} - todo!() -} - -fn parse_assertion<'a>(parser: &mut Parser<'a>) -> (bool, Option>) { - let start = parser.index; - parser.last_assertion_is_quantifiable = false; - - if parser.eat('^') { - return ( - true, - Some(Assertion::BoundaryAssertion(parser.builder.alloc(BoundaryAssertion::EdgeAssertion( - parser.builder.alloc(EdgeAssertion { - span: Span::new(start as u32, parser.index as u32), - kind: todo!(), - }), - ))), - ); - } - todo!() -} - -/// Validate the next characters as a RegExp `Quantifier` production if possible. -/// ``` -/// Quantifier:: -/// QuantifierPrefix -/// QuantifierPrefix `?` -/// QuantifierPrefix:: -/// `*` -/// `+` -/// `?` -/// `{` DecimalDigits `}` -/// `{` DecimalDigits `,}` -/// `{` DecimalDigits `,` DecimalDigits `}` -/// ``` -/// returns `true` if it consumed the next characters successfully. -fn parse_quantifier<'a>( - parser: &mut Parser<'a>, - no_consume: Option, -) -> (bool, Option>) { - let mut no_consume = no_consume.unwrap_or_default(); - let start = parser.index; - let mut min = 0; - let mut max = 0; - let mut greedy = false; - let mut element = None; - match parser.current().cloned() { - Some('*') => { - min = 0; - max = usize::MAX; - parser.advance(); - } - Some('+') => { - min = 1; - max = usize::MAX; - parser.advance(); - } - Some('?') => { - min = 0; - max = 1; - parser.advance(); - } - Some(_) => { - if parse_braced_quantifier(parser, no_consume) { - min = parser.last_range.start; - max = parser.last_range.end; - } - } - None => return (false, None), - } - greedy = !parser.eat('?'); - - if !no_consume { - let quantifier = parser.builder.alloc(Quantifier { - span: Span { start: start as u32, end: parser.index as u32 }, - min, - max, - greedy, - // https://github.com/eslint-community/regexpp/blob/2e8f1af992fb12eae46a446253e8fa3f6cede92a/src/parser.ts#L269-L275 - // it can't be null, or the program will panic, so we put a dummy element, and parent - // should replace it - element: QuantifiableElement::Character(Character { span: Span::default(), value: 0 }), - }); - - element = Some(Element::Quantifier(quantifier)) - } - (true, element) -} - -fn parse_braced_quantifier<'a>(parser: &mut Parser<'a>, no_error: bool) -> bool { - let start = parser.index; - if eat_decimal_digits(parser) { - let min = parser.last_int_value; - let mut max = min; - if parser.eat(',') { - max = if eat_decimal_digits(parser) { parser.last_int_value } else { usize::MAX }; - } - if parser.eat('}') { - if !no_error && max < min { - panic!("numbers out of order in {{}} quantifier"); - } - parser.last_range = min..max; - return true; - } - } - if !no_error && (parser.context.unicode_mode || parser.context.strict) { - panic!("Incomplete quantifier"); - } - parser.rewind(start); - false -} - -fn eat_decimal_digits<'a>(parser: &mut Parser<'a>) -> bool { - let start = parser.index; - parser.last_int_value = 0; - while let Some(ch) = parser.current() { - let Some(d) = ch.to_digit(10) else { - break; - }; - parser.last_int_value = 10 * parser.last_int_value + d as usize; - parser.advance(); - } - parser.index != start -} - -fn count_capturing_parens<'a>(parser: &mut Parser<'a>) -> usize { - let start = parser.index; - let mut in_class = false; - let mut escaped = false; - let mut count = 0; - while let Some(ch) = parser.current() { - if escaped { - escaped = false; - } - match ch { - '\\' => { - escaped = true; - } - '[' | ']' => { - in_class = false; - } - '(' if !in_class => { - if parser.next() != Some(&'?') - || (parser.nth(2) == Some(&'<') - && !matches!(parser.nth(3), Some(&'=') | Some(&'!'))) - { - count += 1; - } - } - _ => {} - } - parser.advance(); - } - parser.rewind(start); - count -} diff --git a/crates/oxc_js_regex/src/ast.rs b/crates/oxc_js_regex/src/ast.rs index 5d429b4f063b2..ff0f940eb7408 100644 --- a/crates/oxc_js_regex/src/ast.rs +++ b/crates/oxc_js_regex/src/ast.rs @@ -358,7 +358,7 @@ pub struct StringAlternative<'a> { #[derive(Debug)] pub struct Character { pub span: Span, - pub value: u16, // UTF-16 code point + pub value: char, // UTF-16 code point } #[derive(Debug)] diff --git a/crates/oxc_js_regex/src/parser.rs b/crates/oxc_js_regex/src/parser.rs index e7854d063d509..4678ea0d8e710 100644 --- a/crates/oxc_js_regex/src/parser.rs +++ b/crates/oxc_js_regex/src/parser.rs @@ -110,7 +110,7 @@ impl<'a> Parser<'a> { self.lexer.chars.get(self.index + n) } - /// by default next means `next_1` + /// by default next means `nth(1)` pub fn next(&self) -> Option<&char> { self.lexer.chars.get(self.index + 1) } @@ -233,6 +233,11 @@ fn parse_term<'a>(parser: &mut Parser<'a>) -> (bool, Option>) { todo!() } +fn parse_optional_quantifier<'a>(parser: &mut Parser<'a>) -> (bool, Option>) { + let (_, node) = parse_quantifier(parser, None); + (true, node) +} + fn parse_assertion<'a>(parser: &mut Parser<'a>) -> (bool, Option>) { let start = parser.index; parser.last_assertion_is_quantifiable = false; @@ -377,7 +382,10 @@ fn parse_quantifier<'a>( // https://github.com/eslint-community/regexpp/blob/2e8f1af992fb12eae46a446253e8fa3f6cede92a/src/parser.ts#L269-L275 // it can't be null, or the program will panic, so we put a dummy element, and parent // should replace it - element: QuantifiableElement::Character(Character { span: Span::default(), value: 0 }), + element: QuantifiableElement::Character(Character { + span: Span::default(), + value: ' ', + }), }); element = Some(Element::Quantifier(quantifier)) @@ -408,6 +416,447 @@ fn parse_braced_quantifier<'a>(parser: &mut Parser<'a>, no_error: bool) -> bool false } +fn parse_atom<'a>(parser: &mut Parser<'a>) { + todo!() +} + +fn parse_dot<'a>(parser: &mut Parser<'a>) -> (bool, Option) { + let start = parser.index; + if parser.eat('.') { + (true, Some(Character { span: Span::new(start as u32, parser.index as u32), value: '.' })) + } else { + (false, None) + } +} + +fn parse_reverse_solidus_atom_escape<'a>(parser: &mut Parser<'a>) -> bool { + let start = parser.index; + if parser.eat('\\') { + if parse_atom_escape(parser) { + return true; + } + parser.rewind(start); + } + false +} + +fn parse_atom_escape<'a>(parser: &mut Parser<'a>) -> bool { + if parse_backreference(parser) + || parser.consume_character_class_escape() + || parser.consume_character_escape() + || (parser._n_flag && parser.consume_k_group_name()) + { + true + } else { + if parser.strict || parser._unicode_mode { + parser.raise("Invalid escape"); + } + false + } +} + +fn parse_backreference<'a>(parser: &mut Parser<'a>) -> bool { + let start = parser.index; + if parser.eat_decimal_escape() { + let n = parser.last_int_value; + if n <= parser.num_capturing_parens { + parser.on_backreference(start - 1, parser.index, n); + true + } else { + if parser.context.strict || parser.context.unicode_mode { + panic!("Invalid escape"); + } + parser.rewind(start); + } + } else { + false + } +} + +fn consume_character_class_escape<'a>(parser: &mut Parser<'a>) -> Option { + let start = parser.index; + + if parser.eat(LATIN_SMALL_LETTER_D) { + parser.last_int_value = -1; + parser.on_escape_character_set(start - 1, parser.index, "digit", false); + return Some(UnicodeSetsConsumeResult { may_contain_strings: false }); + } + + if parser.eat(LATIN_CAPITAL_LETTER_D) { + parser._last_int_value = -1; + parser.on_escape_character_set(start - 1, parser.index, "digit", true); + return Some(UnicodeSetsConsumeResult { may_contain_strings: false }); + } + + if parser.eat(LATIN_SMALL_LETTER_S) { + parser._last_int_value = -1; + parser.on_escape_character_set(start - 1, parser.index, "space", false); + return Some(UnicodeSetsConsumeResult { may_contain_strings: false }); + } + + if parser.eat(LATIN_CAPITAL_LETTER_S) { + parser._last_int_value = -1; + parser.on_escape_character_set(start - 1, parser.index, "space", true); + return Some(UnicodeSetsConsumeResult { may_contain_strings: false }); + } + + if parser.eat(LATIN_SMALL_LETTER_W) { + parser._last_int_value = -1; + parser.on_escape_character_set(start - 1, parser.index, "word", false); + return Some(UnicodeSetsConsumeResult { may_contain_strings: false }); + } + + if parser.eat(LATIN_CAPITAL_LETTER_W) { + parser._last_int_value = -1; + parser.on_escape_character_set(start - 1, parser.index, "word", true); + return Some(UnicodeSetsConsumeResult { may_contain_strings: false }); + } + + let mut negate = false; + if parser._unicode_mode + && parser.ecma_version >= 2018 + && (parser.eat(LATIN_SMALL_LETTER_P) || (negate = parser.eat(LATIN_CAPITAL_LETTER_P))) + { + parser._last_int_value = -1; + if parser.eat(LEFT_CURLY_BRACKET) { + if let Some(result) = parser.eat_unicode_property_value_expression() { + if parser.eat(RIGHT_CURLY_BRACKET) { + if negate && result.strings.is_some() { + parser.raise("Invalid property name"); + } + + parser.on_unicode_property_character_set( + start - 1, + parser.index, + "property", + &result.key, + &result.value, + negate, + result.strings.unwrap_or(false), + ); + + return Some(UnicodeSetsConsumeResult { + may_contain_strings: result.strings.unwrap_or(false), + }); + } + } + } + panic!("Invalid property name"); + } + + None +} + +fn consume_k_group_name<'a>(parser: &mut Parser<'a>) -> bool { + let start = parser.index; + + if parser.eat(LATIN_SMALL_LETTER_K) { + if parser.eat_group_name() { + let group_name = parser._last_str_value.clone(); + parser._backreference_names.insert(group_name.clone()); + parser.on_backreference(start - 1, parser.index, group_name); + return true; + } + parser.raise("Invalid named reference"); + } + + false +} + +fn consume_character_class<'a>(parser: &mut Parser<'a>) -> Option { + let start = parser.index; + + if parser.eat(LEFT_SQUARE_BRACKET) { + let negate = parser.eat(CIRCUMFLEX_ACCENT); + parser.on_character_class_enter(start, negate, parser._unicode_sets_mode); + let result = parser.consume_class_contents()?; + if !parser.eat(RIGHT_SQUARE_BRACKET) { + if parser.current_code_point == -1 { + parser.raise("Unterminated character class"); + } + parser.raise("Invalid character in character class"); + } + if negate && result.may_contain_strings { + parser.raise("Negated character class may contain strings"); + } + + parser.on_character_class_leave(start, parser.index, negate); + + // * Static Semantics: MayContainStrings + // CharacterClass[UnicodeMode, UnicodeSetsMode] :: + // [ ^ ClassContents[?UnicodeMode, ?UnicodeSetsMode] ] + // 1. Return false. + // CharacterClass :: [ ClassContents ] + // 1. Return MayContainStrings of the ClassContents. + Some(result) + } else { + None + } +} + +/** + * Consume ClassContents in a character class. + * @returns `UnicodeSetsConsumeResult`. + */ +fn consume_class_contents(&mut self) -> UnicodeSetsConsumeResult { + if self._unicode_sets_mode { + if self.current_code_point == RIGHT_SQUARE_BRACKET { + // [empty] + + // * Static Semantics: MayContainStrings + // ClassContents[UnicodeMode, UnicodeSetsMode] :: + // [empty] + // 1. Return false. + return UnicodeSetsConsumeResult { may_contain_strings: false }; + } + let result = self.consume_class_set_expression(); + + // * Static Semantics: MayContainStrings + // ClassContents :: ClassSetExpression + // 1. Return MayContainStrings of the ClassSetExpression. + return result; + } + + let strict = self.strict || self._unicode_mode; + loop { + // Consume the first ClassAtom + let range_start = self.index; + if !self.consume_class_atom() { + break; + } + let min = self._last_int_value; + + // Consume `-` + if !self.eat(HYPHEN_MINUS) { + continue; + } + self.on_character(range_start - 1, self.index, HYPHEN_MINUS); + + // Consume the second ClassAtom + if !self.consume_class_atom() { + break; + } + let max = self._last_int_value; + + // Validate + if min == -1 || max == -1 { + if strict { + self.raise("Invalid character class"); + } + continue; + } + if min > max { + self.raise("Range out of order in character class"); + } + + self.on_character_class_range(range_start, self.index, min, max); + } + + // * Static Semantics: MayContainStrings + // ClassContents[UnicodeMode, UnicodeSetsMode] :: + // NonemptyClassRanges[?UnicodeMode] + // 1. Return false. + return UnicodeSetsConsumeResult { may_contain_strings: false }; +} + +/** + * Consume ClassAtom in a character class. + * @returns `true` if it consumed the next characters successfully. + */ +fn consume_class_atom(&mut self) -> bool { + let start = self.index; + let cp = self.current_code_point; + + if cp != -1 && cp != REVERSE_SOLIDUS && cp != RIGHT_SQUARE_BRACKET { + self.advance(); + self._last_int_value = cp; + self.on_character(start, self.index, self._last_int_value); + return true; + } + + if self.eat(REVERSE_SOLIDUS) { + if self.consume_class_escape() { + return true; + } + if !self.strict && self.current_code_point == LATIN_SMALL_LETTER_C { + self._last_int_value = REVERSE_SOLIDUS; + self.on_character(start, self.index, self._last_int_value); + return true; + } + if self.strict || self._unicode_mode { + self.raise("Invalid escape"); + } + self.rewind(start); + } + + return false; +} + +/** + * Consume ClassEscape in a character class. + * @returns `true` if it consumed the next characters successfully. + */ +fn consume_class_escape(&mut self) -> bool { + let start = self.index; + + // `b` + if self.eat(LATIN_SMALL_LETTER_B) { + self._last_int_value = BACKSPACE; + self.on_character(start - 1, self.index, self._last_int_value); + return true; + } + + // [+UnicodeMode] `-` + if self._unicode_mode && self.eat(HYPHEN_MINUS) { + self._last_int_value = HYPHEN_MINUS; + self.on_character(start - 1, self.index, self._last_int_value); + return true; + } + + // [annexB][~UnicodeMode] `c` ClassControlLetter + let cp = 0; + if !self.strict + && !self._unicode_mode + && self.current_code_point == LATIN_SMALL_LETTER_C + && (is_decimal_digit((cp = self.next_code_point)) || cp == LOW_LINE) + { + self.advance(); + self.advance(); + self._last_int_value = cp % 0x20; + self.on_character(start - 1, self.index, self._last_int_value); + return true; + } + + return self.consume_character_class_escape() || self.consume_character_escape(); +} + +/** + * Consume ClassSetExpression in a character class. + * @returns `UnicodeSetsConsumeResult`. + */ +fn consume_class_set_expression(&mut self) -> UnicodeSetsConsumeResult { + let start = self.index; + let mut may_contain_strings: Option = None; + let mut result: Option = None; + + if self.consume_class_set_character() { + if self.consume_class_set_range_from_operator(start) { + // ClassUnion + self.consume_class_union_right(UnicodeSetsConsumeResult { may_contain_strings: None }); + return UnicodeSetsConsumeResult { may_contain_strings: false }; + } + // ClassSetOperand + + // * Static Semantics: MayContainStrings + // ClassSetOperand :: + // ClassSetCharacter + // 1. Return false. + may_contain_strings = Some(false); + } else if let Some(res) = self.consume_class_set_operand() { + may_contain_strings = Some(res.may_contain_strings); + } else { + let cp = self.current_code_point; + if cp == REVERSE_SOLIDUS { + self.advance(); + self.raise("Invalid escape"); + } + if cp == self.next_code_point && is_class_set_reserved_double_punctuator_character(cp) { + self.raise("Invalid set operation in character class"); + } + self.raise("Invalid character in character class"); + } + + if self.eat2(AMPERSAND, AMPERSAND) { + // ClassIntersection + while self.current_code_point != AMPERSAND + && (result = self.consume_class_set_operand()).is_some() + { + self.on_class_intersection(start, self.index); + if !result.as_ref().unwrap().may_contain_strings.unwrap_or(false) { + may_contain_strings = Some(false); + } + if self.eat2(AMPERSAND, AMPERSAND) { + continue; + } + + // * Static Semantics: MayContainStrings + // ClassSetExpression :: ClassIntersection + // 1. Return MayContainStrings of the ClassIntersection. + // ClassIntersection :: ClassSetOperand && ClassSetOperand + // 1. If MayContainStrings of the first ClassSetOperand is false, return false. + // 2. If MayContainStrings of the second ClassSetOperand is false, return false. + // 3. Return true. + // ClassIntersection :: ClassIntersection && ClassSetOperand + // 1. If MayContainStrings of the ClassIntersection is false, return false. + // 2. If MayContainStrings of the ClassSetOperand is false, return false. + // 3. Return true. + return UnicodeSetsConsumeResult { may_contain_strings }; + } + + self.raise("Invalid character in character class"); + } + if self.eat2(HYPHEN_MINUS, HYPHEN_MINUS) { + // ClassSubtraction + while self.consume_class_set_operand() { + self.on_class_subtraction(start, self.index); + if self.eat2(HYPHEN_MINUS, HYPHEN_MINUS) { + continue; + } + // * Static Semantics: MayContainStrings + // ClassSetExpression :: ClassSubtraction + // 1. Return MayContainStrings of the ClassSubtraction. + // ClassSubtraction :: ClassSetOperand -- ClassSetOperand + // 1. Return MayContainStrings of the first ClassSetOperand. + // ClassSubtraction :: ClassSubtraction -- ClassSetOperand + // 1. Return MayContainStrings of the ClassSubtraction. + return UnicodeSetsConsumeResult { may_contain_strings }; + } + + self.raise("Invalid character in character class"); + } + // ClassUnion + return self.consume_class_union_right(UnicodeSetsConsumeResult { may_contain_strings }); +} + +/** + * Consume the right operand of a ClassUnion in a character class. + * @param left_result The result information for the left ClassSetRange or ClassSetOperand. + * @returns `UnicodeSetsConsumeResult`. + */ +fn consume_class_union_right( + &mut self, + left_result: UnicodeSetsConsumeResult, +) -> UnicodeSetsConsumeResult { + // ClassUnion + let mut may_contain_strings = left_result.may_contain_strings.unwrap_or(false); + loop { + let start = self.index; + if self.consume_class_set_character() { + self.consume_class_set_range_from_operator(start); + continue; + } + if let Some(result) = self.consume_class_set_operand() { + if result.may_contain_strings.unwrap_or(false) { + may_contain_strings = true; + } + continue; + } + break; + } + + // * Static Semantics: MayContainStrings + // ClassSetExpression :: ClassUnion + // 1. Return MayContainStrings of the ClassUnion. + // ClassUnion :: ClassSetRange ClassUnion(opt) + // 1. If the ClassUnion is present, return MayContainStrings of the ClassUnion. + // 2. Return false. + // ClassUnion :: ClassSetOperand ClassUnion(opt) + // 1. If MayContainStrings of the ClassSetOperand is true, return true. + // 2. If ClassUnion is present, return MayContainStrings of the ClassUnion. + // 3. Return false. + return UnicodeSetsConsumeResult { may_contain_strings }; +} + fn eat_decimal_digits<'a>(parser: &mut Parser<'a>) -> bool { let start = parser.index; parser.last_int_value = 0; @@ -438,7 +887,7 @@ fn count_capturing_parens<'a>(parser: &mut Parser<'a>) -> usize { in_class = false; } '(' if !in_class => { - if parser.next() != Some(&'?') + if parser.nth(1) != Some(&'?') || (parser.nth(2) == Some(&'<') && !matches!(parser.nth(3), Some(&'=') | Some(&'!'))) { @@ -452,3 +901,668 @@ fn count_capturing_parens<'a>(parser: &mut Parser<'a>) -> usize { parser.rewind(start); count } + +/** + * Consume NestedClass in a character class. + * @returns `UnicodeSetsConsumeResult`. + */ +fn consume_nested_class(&mut self) -> Option { + let start = self.index; + if self.eat(LEFT_SQUARE_BRACKET) { + let negate = self.eat(CIRCUMFLEX_ACCENT); + self.on_character_class_enter(start, negate, true); + let result = self.consume_class_contents(); + if !self.eat(RIGHT_SQUARE_BRACKET) { + self.raise("Unterminated character class"); + } + if negate && result.may_contain_strings.unwrap_or(false) { + self.raise("Negated character class may contain strings"); + } + self.on_character_class_leave(start, self.index, negate); + + // * Static Semantics: MayContainStrings + // NestedClass :: + // [ ^ ClassContents[+UnicodeMode, +UnicodeSetsMode] ] + // 1. Return false. + // NestedClass :: [ ClassContents ] + // 1. Return MayContainStrings of the ClassContents. + return Some(result); + } + if self.eat(REVERSE_SOLIDUS) { + if let Some(result) = self.consume_character_class_escape() { + // * Static Semantics: MayContainStrings + // NestedClass :: \ CharacterClassEscape + // 1. Return MayContainStrings of the CharacterClassEscape. + return Some(result); + } + self.rewind(start); + } + None +} + +/** + * Consume ClassStringDisjunction in a character class. + * @returns `UnicodeSetsConsumeResult`. + */ +fn consume_class_string_disjunction(&mut self) -> Option { + let start = self.index; + if self.eat3(REVERSE_SOLIDUS, LATIN_SMALL_LETTER_Q, LEFT_CURLY_BRACKET) { + self.on_class_string_disjunction_enter(start); + + let mut i = 0; + let mut may_contain_strings = false; + while self.consume_class_string(i).may_contain_strings.unwrap_or(false) { + may_contain_strings = true; + i += 1; + if !self.eat(VERTICAL_LINE) { + break; + } + } + + if self.eat(RIGHT_CURLY_BRACKET) { + self.on_class_string_disjunction_leave(start, self.index); + + // * Static Semantics: MayContainStrings + // ClassStringDisjunction :: \q{ ClassStringDisjunctionContents } + // 1. Return MayContainStrings of the ClassStringDisjunctionContents. + // ClassStringDisjunctionContents :: ClassString + // 1. Return MayContainStrings of the ClassString. + // ClassStringDisjunctionContents :: ClassString | ClassStringDisjunctionContents + // 1. If MayContainStrings of the ClassString is true, return true. + // 2. Return MayContainStrings of the ClassStringDisjunctionContents. + return Some(UnicodeSetsConsumeResult { may_contain_strings }); + } + self.raise("Unterminated class string disjunction"); + } + None +} + +/** + * Consume ClassString in a character class. + * @param i - The index of the string alternative. + * @returns `UnicodeSetsConsumeResult`. + */ +fn consume_class_string(&mut self, i: usize) -> UnicodeSetsConsumeResult { + let start = self.index; + + let mut count = 0; + self.on_string_alternative_enter(start, i); + + while self.current_code_point != -1 && self.consume_class_set_character() { + count += 1; + } + + self.on_string_alternative_leave(start, self.index, i); + + // * Static Semantics: MayContainStrings + // ClassString :: [empty] + // 1. Return true. + // ClassString :: NonEmptyClassString + // 1. Return MayContainStrings of the NonEmptyClassString. + // NonEmptyClassString :: ClassSetCharacter NonEmptyClassString(opt) + // 1. If NonEmptyClassString is present, return true. + // 2. Return false. + return UnicodeSetsConsumeResult { may_contain_strings: Some(count != 1) }; +} + +/** + * Consume ClassSetCharacter in a character class. + * Set `self._last_int_value` if it consumed the next characters successfully. + * @returns `true` if it ate the next characters successfully. + */ +fn consume_class_set_character(&mut self) -> bool { + let start = self.index; + let cp = self.current_code_point; + + if cp != -1 && cp != self.next_code_point + || !is_class_set_reserved_double_punctuator_character(cp) + { + if cp != -1 && !is_class_set_syntax_character(cp) { + self._last_int_value = cp; + self.advance(); + self.on_character(start, self.index, self._last_int_value); + return true; + } + } + + if self.eat(REVERSE_SOLIDUS) { + if self.consume_character_escape() { + return true; + } + if is_class_set_reserved_punctuator(self.current_code_point) { + self._last_int_value = self.current_code_point; + self.advance(); + self.on_character(start, self.index, self._last_int_value); + return true; + } + if self.eat(LATIN_SMALL_LETTER_B) { + self._last_int_value = BACKSPACE; + self.on_character(start, self.index, self._last_int_value); + return true; + } + self.rewind(start); + } + + false +} + +/** + * Eat the next characters as a RegExp `GroupName` production if possible. + * Set `self._last_str_value` if the group name existed. + * @returns `true` if it ate the next characters successfully. + */ +fn eat_group_name(&mut self) -> bool { + if self.eat(LESS_THAN_SIGN) { + if self.eat_reg_exp_identifier_name() && self.eat(GREATER_THAN_SIGN) { + return true; + } + self.raise("Invalid capture group name"); + } + false +} + +/** + * Eat the next characters as a RegExp `RegExpIdentifierName` production if + * possible. + * Set `self._last_str_value` if the identifier name existed. + * @returns `true` if it ate the next characters successfully. + */ +fn eat_reg_exp_identifier_name(&mut self) -> bool { + if self.eat_reg_exp_identifier_start() { + self._last_str_value = self._last_int_value.to_string(); + + while self.eat_reg_exp_identifier_part() { + self._last_str_value.push_str(&self._last_int_value.to_string()); + } + + return true; + } + false +} + +/** + * Eat the next characters as a RegExp `RegExpIdentifierStart` production if + * possible. + * Set `self._last_int_value` if the identifier start existed. + * @returns `true` if it ate the next characters successfully. + */ +fn eat_reg_exp_identifier_start(&mut self) -> bool { + let start = self.index; + let force_u_flag = !self._unicode_mode && self.ecma_version >= 2020; + let mut cp = self.current_code_point; + self.advance(); + + if cp == REVERSE_SOLIDUS && self.eat_reg_exp_unicode_escape_sequence(force_u_flag) { + cp = self._last_int_value; + } else if force_u_flag && is_lead_surrogate(cp) && is_trail_surrogate(self.current_code_point) { + cp = combine_surrogate_pair(cp, self.current_code_point); + self.advance(); + } + + if is_identifier_start_char(cp) { + self._last_int_value = cp; + return true; + } + + if self.index != start { + self.rewind(start); + } + false +} + +/** + * Eat the next characters as a RegExp `RegExpIdentifierPart` production if possible. + * Set `self._last_int_value` if it ate the next characters successfully. + * ``` + * RegExpIdentifierPart[UnicodeMode]:: + * RegExpIdentifierStart[?UnicodeMode] + * DecimalDigit + * \ UnicodeEscapeSequence[+UnicodeMode] + * ``` + * @returns `true` if it ate the next characters successfully. + */ +fn eat_reg_exp_identifier_part(&mut self) -> bool { + let start = self.index; + let force_u_flag = !self._unicode_mode && self.ecma_version >= 2020; + let mut cp = self.current_code_point; + self.advance(); + + if cp == REVERSE_SOLIDUS && self.eat_reg_exp_unicode_escape_sequence(force_u_flag) { + cp = self._last_int_value; + } else if force_u_flag && is_lead_surrogate(cp) && is_trail_surrogate(self.current_code_point) { + cp = combine_surrogate_pair(cp, self.current_code_point); + self.advance(); + } + + if is_identifier_part_char(cp) { + self._last_int_value = cp; + return true; + } + + if self.index != start { + self.rewind(start); + } + false +} + +/** + * Eat the next characters as the following alternatives if possible. + * Set `self._last_int_value` if it ate the next characters successfully. + * ``` + * `c` ControlLetter + * ``` + * @returns `true` if it ate the next characters successfully. + */ +fn eat_c_control_letter(&mut self) -> bool { + let start = self.index; + if self.eat(LATIN_SMALL_LETTER_C) { + if self.eat_control_letter() { + return true; + } + self.rewind(start); + } + false +} + +/** + * Eat the next characters as the following alternatives if possible. + * Set `self._last_int_value` if it ate the next characters successfully. + * ``` + * `0` [lookahead ∉ DecimalDigit] + * ``` + * @returns `true` if it ate the next characters successfully. + */ +fn eat_zero(&mut self) -> bool { + if self.current_code_point == DIGIT_ZERO && !is_decimal_digit(self.next_code_point) { + self._last_int_value = 0; + self.advance(); + return true; + } + false +} + +/** + * Eat the next characters as a RegExp `ControlEscape` production if + * possible. + * Set `self._last_int_value` if it ate the next characters successfully. + * ``` + * ControlEscape:: one of + * f n r t v + * ``` + * @returns `true` if it ate the next characters successfully. + */ +fn eat_control_escape(&mut self) -> bool { + if self.eat(LATIN_SMALL_LETTER_F) { + self._last_int_value = FORM_FEED; + return true; + } + if self.eat(LATIN_SMALL_LETTER_N) { + self._last_int_value = LINE_FEED; + return true; + } + if self.eat(LATIN_SMALL_LETTER_R) { + self._last_int_value = CARRIAGE_RETURN; + return true; + } + if self.eat(LATIN_SMALL_LETTER_T) { + self._last_int_value = CHARACTER_TABULATION; + return true; + } + if self.eat(LATIN_SMALL_LETTER_V) { + self._last_int_value = LINE_TABULATION; + return true; + } + false +} + +/** + * Eat the next characters as a RegExp `ControlLetter` production if + * possible. + * Set `self._last_int_value` if it ate the next characters successfully. + * ``` + * ControlLetter:: one of + * a b c d e f g h i j k l m n o p q r s t u v w x y z + * A B C D E F G H I J K L M N O P Q R S T U V W X Y Z + * ``` + * @returns `true` if it ate the next characters successfully. + */ +fn eat_control_letter(&mut self) -> bool { + let cp = self.current_code_point; + if is_latin_letter(cp) { + self.advance(); + self._last_int_value = cp % 0x20; + return true; + } + false +} + +/** + * Eat the next characters as a RegExp `RegExpUnicodeEscapeSequence` + * production if possible. + * Set `self._last_int_value` if it ate the next characters successfully. + * ``` + * RegExpUnicodeEscapeSequence[UnicodeMode]:: + * [+UnicodeMode] `u` HexLeadSurrogate `\u` HexTrailSurrogate + * [+UnicodeMode] `u` HexLeadSurrogate + * [+UnicodeMode] `u` HexTrailSurrogate + * [+UnicodeMode] `u` HexNonSurrogate + * [~UnicodeMode] `u` Hex4Digits + * [+UnicodeMode] `u{` CodePoint `}` + * ``` + * @returns `true` if it ate the next characters successfully. + */ +fn eat_reg_exp_unicode_escape_sequence(&mut self, force_u_flag: bool) -> bool { + let start = self.index; + let u_flag = force_u_flag || self._unicode_mode; + + if self.eat(LATIN_SMALL_LETTER_U) { + if (u_flag && self.eat_reg_exp_unicode_surrogate_pair_escape()) + || self.eat_fixed_hex_digits(4) + || (u_flag && self.eat_reg_exp_unicode_code_point_escape()) + { + return true; + } + if self.strict || u_flag { + self.raise("Invalid unicode escape"); + } + self.rewind(start); + } + + false +} + +/** + * Eat the next characters as the following alternatives if possible. + * Set `self._last_int_value` if it ate the next characters successfully. + * ``` + * HexLeadSurrogate `\u` HexTrailSurrogate + * ``` + * @returns `true` if it ate the next characters successfully. + */ +fn eat_reg_exp_unicode_surrogate_pair_escape(&mut self) -> bool { + let start = self.index; + + if self.eat_fixed_hex_digits(4) { + let lead = self._last_int_value; + if is_lead_surrogate(lead) + && self.eat(REVERSE_SOLIDUS) + && self.eat(LATIN_SMALL_LETTER_U) + && self.eat_fixed_hex_digits(4) + { + let trail = self._last_int_value; + if is_trail_surrogate(trail) { + self._last_int_value = combine_surrogate_pair(lead, trail); + return true; + } + } + + self.rewind(start); + } + + false +} + +/** + * Eat the next characters as the following alternatives if possible. + * Set `self._last_int_value` if it ate the next characters successfully. + * ``` + * `{` CodePoint `}` + * ``` + * @returns `true` if it ate the next characters successfully. + */ +fn eat_reg_exp_unicode_code_point_escape(&mut self) -> bool { + let start = self.index; + + if self.eat(LEFT_CURLY_BRACKET) + && self.eat_hex_digits() + && self.eat(RIGHT_CURLY_BRACKET) + && is_valid_unicode(self._last_int_value) + { + return true; + } + + self.rewind(start); + false +} + +/** + * Eat the next characters as a RegExp `IdentityEscape` production if + * possible. + * Set `self._last_int_value` if it ate the next characters successfully. + * ``` + * IdentityEscape[UnicodeMode, N]:: + * [+UnicodeMode] SyntaxCharacter + * [+UnicodeMode] `/` + * [strict][~UnicodeMode] SourceCharacter but not UnicodeIDContinue + * [annexB][~UnicodeMode] SourceCharacterIdentityEscape[?N] + * SourceCharacterIdentityEscape[N]:: + * [~N] SourceCharacter but not c + * [+N] SourceCharacter but not one of c k + * ``` + * @returns `true` if it ate the next characters successfully. + */ +fn eat_identity_escape(&mut self) -> bool { + let cp = self.current_code_point; + if self.is_valid_identity_escape(cp) { + self._last_int_value = cp; + self.advance(); + return true; + } + false +} + +fn is_valid_identity_escape(&self, cp: i32) -> bool { + if cp == -1 { + return false; + } + if self._unicode_mode { + return is_syntax_character(cp) || cp == SOLIDUS; + } + if self.strict { + return !is_id_continue(cp); + } + if self._n_flag { + return !(cp == LATIN_SMALL_LETTER_C || cp == LATIN_SMALL_LETTER_K); + } + cp != LATIN_SMALL_LETTER_C +} + +/** + * Eat the next characters as a RegExp `DecimalEscape` production if + * possible. + * Set `self._last_int_value` if it ate the next characters successfully. + * ``` + * DecimalEscape:: + * NonZeroDigit DecimalDigits(opt) [lookahead ∉ DecimalDigit] + * ``` + * @returns `true` if it ate the next characters successfully. + */ +fn eat_decimal_escape(&mut self) -> bool { + self._last_int_value = 0; + let mut cp = self.current_code_point; + if cp >= DIGIT_ONE && cp <= DIGIT_NINE { + while cp >= DIGIT_ZERO && cp <= DIGIT_NINE { + self._last_int_value = 10 * self._last_int_value + (cp - DIGIT_ZERO); + self.advance(); + cp = self.current_code_point; + } + return true; + } + false +} + +/** + * Eat the next characters as a RegExp `ControlLetter` production if + * possible. + * Set `self._last_int_value` if it ate the next characters successfully. + * ``` + * ControlLetter:: one of + * a b c d e f g h i j k l m n o p q r s t u v w x y z + * A B C D E F G H I J K L M N O P Q R S T U V W X Y Z + * ``` + * @returns `true` if it ate the next characters successfully. + */ +fn eat_control_letter(&mut self) -> bool { + let cp = self.current_code_point; + if is_latin_letter(cp) { + self.advance(); + self._last_int_value = cp % 0x20; + return true; + } + false +} + +/** + * Eat the next characters as a RegExp `RegExpUnicodeEscapeSequence` + * production if possible. + * Set `self._last_int_value` if it ate the next characters successfully. + * ``` + * RegExpUnicodeEscapeSequence[UnicodeMode]:: + * [+UnicodeMode] `u` HexLeadSurrogate `\u` HexTrailSurrogate + * [+UnicodeMode] `u` HexLeadSurrogate + * [+UnicodeMode] `u` HexTrailSurrogate + * [+UnicodeMode] `u` HexNonSurrogate + * [~UnicodeMode] `u` Hex4Digits + * [+UnicodeMode] `u{` CodePoint `}` + * ``` + * @returns `true` if it ate the next characters successfully. + */ +fn eat_reg_exp_unicode_escape_sequence(&mut self, force_u_flag: bool) -> bool { + let start = self.index; + let u_flag = force_u_flag || self._unicode_mode; + + if self.eat(LATIN_SMALL_LETTER_U) { + if (u_flag && self.eat_reg_exp_unicode_surrogate_pair_escape()) + || self.eat_fixed_hex_digits(4) + || (u_flag && self.eat_reg_exp_unicode_code_point_escape()) + { + return true; + } + if self.strict || u_flag { + self.raise("Invalid unicode escape"); + } + self.rewind(start); + } + + false +} + +/** + * Eat the next characters as the following alternatives if possible. + * Set `self._last_int_value` if it ate the next characters successfully. + * ``` + * HexLeadSurrogate `\u` HexTrailSurrogate + * ``` + * @returns `true` if it ate the next characters successfully. + */ +fn eat_reg_exp_unicode_surrogate_pair_escape(&mut self) -> bool { + let start = self.index; + + if self.eat_fixed_hex_digits(4) { + let lead = self._last_int_value; + if is_lead_surrogate(lead) + && self.eat(REVERSE_SOLIDUS) + && self.eat(LATIN_SMALL_LETTER_U) + && self.eat_fixed_hex_digits(4) + { + let trail = self._last_int_value; + if is_trail_surrogate(trail) { + self._last_int_value = combine_surrogate_pair(lead, trail); + return true; + } + } + + self.rewind(start); + } + + false +} + +/** + * Eat the next characters as the following alternatives if possible. + * Set `self._last_int_value` if it ate the next characters successfully. + * ``` + * `{` CodePoint `}` + * ``` + * @returns `true` if it ate the next characters successfully. + */ +fn eat_reg_exp_unicode_code_point_escape(&mut self) -> bool { + let start = self.index; + + if self.eat(LEFT_CURLY_BRACKET) + && self.eat_hex_digits() + && self.eat(RIGHT_CURLY_BRACKET) + && is_valid_unicode(self._last_int_value) + { + return true; + } + + self.rewind(start); + false +} + +/** + * Eat the next characters as a RegExp `IdentityEscape` production if + * possible. + * Set `self._last_int_value` if it ate the next characters successfully. + * ``` + * IdentityEscape[UnicodeMode, N]:: + * [+UnicodeMode] SyntaxCharacter + * [+UnicodeMode] `/` + * [strict][~UnicodeMode] SourceCharacter but not UnicodeIDContinue + * [annexB][~UnicodeMode] SourceCharacterIdentityEscape[?N] + * SourceCharacterIdentityEscape[N]:: + * [~N] SourceCharacter but not c + * [+N] SourceCharacter but not one of c k + * ``` + * @returns `true` if it ate the next characters successfully. + */ +fn eat_identity_escape(&mut self) -> bool { + let cp = self.current_code_point; + if self.is_valid_identity_escape(cp) { + self._last_int_value = cp; + self.advance(); + return true; + } + false +} + +fn is_valid_identity_escape(&self, cp: i32) -> bool { + if cp == -1 { + return false; + } + if self._unicode_mode { + return is_syntax_character(cp) || cp == SOLIDUS; + } + if self.strict { + return !is_id_continue(cp); + } + if self._n_flag { + return !(cp == LATIN_SMALL_LETTER_C || cp == LATIN_SMALL_LETTER_K); + } + cp != LATIN_SMALL_LETTER_C +} + +/** + * Eat the next characters as a RegExp `DecimalEscape` production if + * possible. + * Set `self._last_int_value` if it ate the next characters successfully. + * ``` + * DecimalEscape:: + * NonZeroDigit DecimalDigits(opt) [lookahead ∉ DecimalDigit] + * ``` + * @returns `true` if it ate the next characters successfully. + */ +fn eat_decimal_escape(&mut self) -> bool { + self._last_int_value = 0; + let mut cp = self.current_code_point; + if cp >= DIGIT_ONE && cp <= DIGIT_NINE { + while cp >= DIGIT_ZERO && cp <= DIGIT_NINE { + self._last_int_value = 10 * self._last_int_value + (cp - DIGIT_ZERO); + self.advance(); + cp = self.current_code_point; + } + return true; + } + false +} From 15582e874d83e81ec6d0d0400fb7147719cce6a0 Mon Sep 17 00:00:00 2001 From: IWANABETHATGUY Date: Sun, 14 Jan 2024 21:58:49 +0800 Subject: [PATCH 09/19] =?UTF-8?q?fix:=20=F0=9F=90=9B=20syntax=20err?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/oxc_js_regex/src/parser.rs | 87 ++++++++++++++++--------------- 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/crates/oxc_js_regex/src/parser.rs b/crates/oxc_js_regex/src/parser.rs index 4678ea0d8e710..c61291b9f6a1d 100644 --- a/crates/oxc_js_regex/src/parser.rs +++ b/crates/oxc_js_regex/src/parser.rs @@ -133,7 +133,7 @@ impl<'a> Parser<'a> { } } - pub fn rewind(&mut self, start: usize) { + pub fn rewind<'a>(parser: &mut Parser<'a>, start: usize) { self.index = start; } } @@ -444,7 +444,7 @@ fn parse_atom_escape<'a>(parser: &mut Parser<'a>) -> bool { if parse_backreference(parser) || parser.consume_character_class_escape() || parser.consume_character_escape() - || (parser._n_flag && parser.consume_k_group_name()) + || (parser.context.nflag && parser.consume_k_group_name()) { true } else { @@ -467,6 +467,7 @@ fn parse_backreference<'a>(parser: &mut Parser<'a>) -> bool { panic!("Invalid escape"); } parser.rewind(start); + true } } else { false @@ -598,7 +599,7 @@ fn consume_character_class<'a>(parser: &mut Parser<'a>) -> Option UnicodeSetsConsumeResult { +fn consume_class_contents<'a>(parser: &mut Parser<'a>) -> UnicodeSetsConsumeResult { if self._unicode_sets_mode { if self.current_code_point == RIGHT_SQUARE_BRACKET { // [empty] @@ -663,7 +664,7 @@ fn consume_class_contents(&mut self) -> UnicodeSetsConsumeResult { * Consume ClassAtom in a character class. * @returns `true` if it consumed the next characters successfully. */ -fn consume_class_atom(&mut self) -> bool { +fn consume_class_atom<'a>(parser: &mut Parser<'a>) -> bool { let start = self.index; let cp = self.current_code_point; @@ -696,7 +697,7 @@ fn consume_class_atom(&mut self) -> bool { * Consume ClassEscape in a character class. * @returns `true` if it consumed the next characters successfully. */ -fn consume_class_escape(&mut self) -> bool { +fn consume_class_escape<'a>(parser: &mut Parser<'a>) -> bool { let start = self.index; // `b` @@ -734,7 +735,7 @@ fn consume_class_escape(&mut self) -> bool { * Consume ClassSetExpression in a character class. * @returns `UnicodeSetsConsumeResult`. */ -fn consume_class_set_expression(&mut self) -> UnicodeSetsConsumeResult { +fn consume_class_set_expression<'a>(parser: &mut Parser<'a>) -> UnicodeSetsConsumeResult { let start = self.index; let mut may_contain_strings: Option = None; let mut result: Option = None; @@ -823,8 +824,8 @@ fn consume_class_set_expression(&mut self) -> UnicodeSetsConsumeResult { * @param left_result The result information for the left ClassSetRange or ClassSetOperand. * @returns `UnicodeSetsConsumeResult`. */ -fn consume_class_union_right( - &mut self, +fn consume_class_union_right<'a>( + parser: &mut Parser<'a>, left_result: UnicodeSetsConsumeResult, ) -> UnicodeSetsConsumeResult { // ClassUnion @@ -906,7 +907,7 @@ fn count_capturing_parens<'a>(parser: &mut Parser<'a>) -> usize { * Consume NestedClass in a character class. * @returns `UnicodeSetsConsumeResult`. */ -fn consume_nested_class(&mut self) -> Option { +fn consume_nested_class<'a>(parser: &mut Parser<'a>) -> Option { let start = self.index; if self.eat(LEFT_SQUARE_BRACKET) { let negate = self.eat(CIRCUMFLEX_ACCENT); @@ -944,7 +945,9 @@ fn consume_nested_class(&mut self) -> Option { * Consume ClassStringDisjunction in a character class. * @returns `UnicodeSetsConsumeResult`. */ -fn consume_class_string_disjunction(&mut self) -> Option { +fn consume_class_string_disjunction<'a>( + parser: &mut Parser<'a>, +) -> Option { let start = self.index; if self.eat3(REVERSE_SOLIDUS, LATIN_SMALL_LETTER_Q, LEFT_CURLY_BRACKET) { self.on_class_string_disjunction_enter(start); @@ -982,7 +985,7 @@ fn consume_class_string_disjunction(&mut self) -> Option UnicodeSetsConsumeResult { +fn consume_class_string<'a>(parser: &mut Parser<'a>, i: usize) -> UnicodeSetsConsumeResult { let start = self.index; let mut count = 0; @@ -1010,7 +1013,7 @@ fn consume_class_string(&mut self, i: usize) -> UnicodeSetsConsumeResult { * Set `self._last_int_value` if it consumed the next characters successfully. * @returns `true` if it ate the next characters successfully. */ -fn consume_class_set_character(&mut self) -> bool { +fn consume_class_set_character<'a>(parser: &mut Parser<'a>) -> bool { let start = self.index; let cp = self.current_code_point; @@ -1051,7 +1054,7 @@ fn consume_class_set_character(&mut self) -> bool { * Set `self._last_str_value` if the group name existed. * @returns `true` if it ate the next characters successfully. */ -fn eat_group_name(&mut self) -> bool { +fn eat_group_name<'a>(parser: &mut Parser<'a>) -> bool { if self.eat(LESS_THAN_SIGN) { if self.eat_reg_exp_identifier_name() && self.eat(GREATER_THAN_SIGN) { return true; @@ -1067,7 +1070,7 @@ fn eat_group_name(&mut self) -> bool { * Set `self._last_str_value` if the identifier name existed. * @returns `true` if it ate the next characters successfully. */ -fn eat_reg_exp_identifier_name(&mut self) -> bool { +fn eat_reg_exp_identifier_name<'a>(parser: &mut Parser<'a>) -> bool { if self.eat_reg_exp_identifier_start() { self._last_str_value = self._last_int_value.to_string(); @@ -1086,7 +1089,7 @@ fn eat_reg_exp_identifier_name(&mut self) -> bool { * Set `self._last_int_value` if the identifier start existed. * @returns `true` if it ate the next characters successfully. */ -fn eat_reg_exp_identifier_start(&mut self) -> bool { +fn eat_reg_exp_identifier_start<'a>(parser: &mut Parser<'a>) -> bool { let start = self.index; let force_u_flag = !self._unicode_mode && self.ecma_version >= 2020; let mut cp = self.current_code_point; @@ -1121,7 +1124,7 @@ fn eat_reg_exp_identifier_start(&mut self) -> bool { * ``` * @returns `true` if it ate the next characters successfully. */ -fn eat_reg_exp_identifier_part(&mut self) -> bool { +fn eat_reg_exp_identifier_part<'a>(parser: &mut Parser<'a>) -> bool { let start = self.index; let force_u_flag = !self._unicode_mode && self.ecma_version >= 2020; let mut cp = self.current_code_point; @@ -1153,7 +1156,7 @@ fn eat_reg_exp_identifier_part(&mut self) -> bool { * ``` * @returns `true` if it ate the next characters successfully. */ -fn eat_c_control_letter(&mut self) -> bool { +fn eat_c_control_letter<'a>(parser: &mut Parser<'a>) -> bool { let start = self.index; if self.eat(LATIN_SMALL_LETTER_C) { if self.eat_control_letter() { @@ -1172,7 +1175,7 @@ fn eat_c_control_letter(&mut self) -> bool { * ``` * @returns `true` if it ate the next characters successfully. */ -fn eat_zero(&mut self) -> bool { +fn eat_zero<'a>(parser: &mut Parser<'a>) -> bool { if self.current_code_point == DIGIT_ZERO && !is_decimal_digit(self.next_code_point) { self._last_int_value = 0; self.advance(); @@ -1191,7 +1194,7 @@ fn eat_zero(&mut self) -> bool { * ``` * @returns `true` if it ate the next characters successfully. */ -fn eat_control_escape(&mut self) -> bool { +fn eat_control_escape<'a>(parser: &mut Parser<'a>) -> bool { if self.eat(LATIN_SMALL_LETTER_F) { self._last_int_value = FORM_FEED; return true; @@ -1226,7 +1229,7 @@ fn eat_control_escape(&mut self) -> bool { * ``` * @returns `true` if it ate the next characters successfully. */ -fn eat_control_letter(&mut self) -> bool { +fn eat_control_letter<'a>(parser: &mut Parser<'a>) -> bool { let cp = self.current_code_point; if is_latin_letter(cp) { self.advance(); @@ -1251,7 +1254,7 @@ fn eat_control_letter(&mut self) -> bool { * ``` * @returns `true` if it ate the next characters successfully. */ -fn eat_reg_exp_unicode_escape_sequence(&mut self, force_u_flag: bool) -> bool { +fn eat_reg_exp_unicode_escape_sequence<'a>(parser: &mut Parser<'a>, force_u_flag: bool) -> bool { let start = self.index; let u_flag = force_u_flag || self._unicode_mode; @@ -1279,7 +1282,7 @@ fn eat_reg_exp_unicode_escape_sequence(&mut self, force_u_flag: bool) -> bool { * ``` * @returns `true` if it ate the next characters successfully. */ -fn eat_reg_exp_unicode_surrogate_pair_escape(&mut self) -> bool { +fn eat_reg_exp_unicode_surrogate_pair_escape<'a>(parser: &mut Parser<'a>) -> bool { let start = self.index; if self.eat_fixed_hex_digits(4) { @@ -1310,7 +1313,7 @@ fn eat_reg_exp_unicode_surrogate_pair_escape(&mut self) -> bool { * ``` * @returns `true` if it ate the next characters successfully. */ -fn eat_reg_exp_unicode_code_point_escape(&mut self) -> bool { +fn eat_reg_exp_unicode_code_point_escape(parser: &mut Parser<'a>) -> bool { let start = self.index; if self.eat(LEFT_CURLY_BRACKET) @@ -1341,7 +1344,7 @@ fn eat_reg_exp_unicode_code_point_escape(&mut self) -> bool { * ``` * @returns `true` if it ate the next characters successfully. */ -fn eat_identity_escape(&mut self) -> bool { +fn eat_identity_escape<'a>(parser: &mut Parser<'a>) -> bool { let cp = self.current_code_point; if self.is_valid_identity_escape(cp) { self._last_int_value = cp; @@ -1351,7 +1354,7 @@ fn eat_identity_escape(&mut self) -> bool { false } -fn is_valid_identity_escape(&self, cp: i32) -> bool { +fn is_valid_identity_escape<'a>(parser: &mut Parser<'a>, cp: i32) -> bool { if cp == -1 { return false; } @@ -1377,14 +1380,14 @@ fn is_valid_identity_escape(&self, cp: i32) -> bool { * ``` * @returns `true` if it ate the next characters successfully. */ -fn eat_decimal_escape(&mut self) -> bool { - self._last_int_value = 0; - let mut cp = self.current_code_point; - if cp >= DIGIT_ONE && cp <= DIGIT_NINE { - while cp >= DIGIT_ZERO && cp <= DIGIT_NINE { - self._last_int_value = 10 * self._last_int_value + (cp - DIGIT_ZERO); - self.advance(); - cp = self.current_code_point; +fn eat_decimal_escape<'a>(parser: &mut Parser<'a>) -> bool { + parser.last_int_value = 0; + let mut cp = parser.current(); + if cp >= Some(&'1') && cp <= Some(&'9') { + while cp >= Some(&'1') && cp <= Some(&'9') { + parser.last_int_value = 10 * parser.last_int_value + (cp - DIGIT_ZERO); + parser.advance(); + cp = parser.current(); } return true; } @@ -1402,11 +1405,11 @@ fn eat_decimal_escape(&mut self) -> bool { * ``` * @returns `true` if it ate the next characters successfully. */ -fn eat_control_letter(&mut self) -> bool { - let cp = self.current_code_point; +fn eat_control_letter<'a>(parser: &mut Parser<'a>) -> bool { + let cp = parser.current(); if is_latin_letter(cp) { - self.advance(); - self._last_int_value = cp % 0x20; + parser.advance(); + parser.last_int_value = cp % 0x20; return true; } false @@ -1427,7 +1430,7 @@ fn eat_control_letter(&mut self) -> bool { * ``` * @returns `true` if it ate the next characters successfully. */ -fn eat_reg_exp_unicode_escape_sequence(&mut self, force_u_flag: bool) -> bool { +fn eat_reg_exp_unicode_escape_sequence<'a>(parser: &mut Parser<'a>, force_u_flag: bool) -> bool { let start = self.index; let u_flag = force_u_flag || self._unicode_mode; @@ -1455,7 +1458,7 @@ fn eat_reg_exp_unicode_escape_sequence(&mut self, force_u_flag: bool) -> bool { * ``` * @returns `true` if it ate the next characters successfully. */ -fn eat_reg_exp_unicode_surrogate_pair_escape(&mut self) -> bool { +fn eat_reg_exp_unicode_surrogate_pair_escape<'a>(parser: &mut Parser<'a>) -> bool { let start = self.index; if self.eat_fixed_hex_digits(4) { @@ -1486,7 +1489,7 @@ fn eat_reg_exp_unicode_surrogate_pair_escape(&mut self) -> bool { * ``` * @returns `true` if it ate the next characters successfully. */ -fn eat_reg_exp_unicode_code_point_escape(&mut self) -> bool { +fn eat_reg_exp_unicode_code_point_escape<'a>(parser: &mut Parser<'a>) -> bool { let start = self.index; if self.eat(LEFT_CURLY_BRACKET) @@ -1517,7 +1520,7 @@ fn eat_reg_exp_unicode_code_point_escape(&mut self) -> bool { * ``` * @returns `true` if it ate the next characters successfully. */ -fn eat_identity_escape(&mut self) -> bool { +fn eat_identity_escape<'a>(parser: &mut Parser<'a>) -> bool { let cp = self.current_code_point; if self.is_valid_identity_escape(cp) { self._last_int_value = cp; @@ -1553,7 +1556,7 @@ fn is_valid_identity_escape(&self, cp: i32) -> bool { * ``` * @returns `true` if it ate the next characters successfully. */ -fn eat_decimal_escape(&mut self) -> bool { +fn eat_decimal_escape<'a>(parser: &mut Parser<'a>) -> bool { self._last_int_value = 0; let mut cp = self.current_code_point; if cp >= DIGIT_ONE && cp <= DIGIT_NINE { From 5a5b523ed438d1ccc16cb1d61fb8dd915511e6cf Mon Sep 17 00:00:00 2001 From: IWANABETHATGUY Date: Sun, 14 Jan 2024 22:32:20 +0800 Subject: [PATCH 10/19] =?UTF-8?q?chore:=20=F0=9F=A4=96=20sk=20point?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/oxc_js_regex/src/ast.rs | 2 +- crates/oxc_js_regex/src/parser.rs | 78 +++++++++++++++++++------------ 2 files changed, 49 insertions(+), 31 deletions(-) diff --git a/crates/oxc_js_regex/src/ast.rs b/crates/oxc_js_regex/src/ast.rs index ff0f940eb7408..f4e5900b8da32 100644 --- a/crates/oxc_js_regex/src/ast.rs +++ b/crates/oxc_js_regex/src/ast.rs @@ -119,7 +119,7 @@ pub struct Group<'a> { /// The capturing group. /// E.g. `(ab)`, `(?ab)` -#[derive(Debug)] +#[derive(Debug, Default)] pub struct CapturingGroup<'a> { pub span: Span, pub name: Option, diff --git a/crates/oxc_js_regex/src/parser.rs b/crates/oxc_js_regex/src/parser.rs index c61291b9f6a1d..061906951b33d 100644 --- a/crates/oxc_js_regex/src/parser.rs +++ b/crates/oxc_js_regex/src/parser.rs @@ -9,9 +9,10 @@ use oxc_diagnostics::Error; use oxc_span::Span; use crate::ast::{ - Alternative, Assertion, BoundaryAssertion, Branch, Character, EdgeAssertion, EdgeAssertionKind, - Element, LookaheadAssertion, LookaroundAssertion, LookbehindAssertion, Pattern, - QuantifiableElement, Quantifier, RegExpLiteral, WordBoundaryAssertion, + Alternative, Assertion, Backreference, BackreferenceRef, BoundaryAssertion, Branch, + CapturingGroup, Character, EdgeAssertion, EdgeAssertionKind, Element, LookaheadAssertion, + LookaroundAssertion, LookbehindAssertion, Pattern, QuantifiableElement, Quantifier, + RegExpLiteral, WordBoundaryAssertion, }; use crate::ast_builder::AstBuilder; use crate::ecma_version::EcmaVersion; @@ -49,6 +50,7 @@ pub struct Parser<'a> { back_reference_names: HashSet, last_assertion_is_quantifiable: bool, last_range: Range, + last_str_value: Stirng, } #[derive(Default, Copy, Clone)] @@ -77,6 +79,7 @@ impl<'a> Parser<'a> { last_range: 0..0, last_assertion_is_quantifiable: false, builder: AstBuilder::new(allocator), + last_str_value: String::default(), } } @@ -92,6 +95,9 @@ impl<'a> Parser<'a> { false } } + pub fn span_with_start(&self, start: u32) -> Span { + Span::new(start, self.index as u32) + } pub fn eat2(&mut self, first: char, second: char) -> bool { if self.is(first) && self.nth(1) == Some(&second) { @@ -455,62 +461,70 @@ fn parse_atom_escape<'a>(parser: &mut Parser<'a>) -> bool { } } -fn parse_backreference<'a>(parser: &mut Parser<'a>) -> bool { +/// TODO: resolve when pattern leave +fn parse_backreference<'a>(parser: &mut Parser<'a>) -> Option> { let start = parser.index; if parser.eat_decimal_escape() { let n = parser.last_int_value; if n <= parser.num_capturing_parens { - parser.on_backreference(start - 1, parser.index, n); - true + Some(Backreference { + span: Span::new(start as u32, parser.index as u32), + reference: BackreferenceRef::Number(n as usize), + resolved: CapturingGroup::default(), + }) } else { if parser.context.strict || parser.context.unicode_mode { panic!("Invalid escape"); } parser.rewind(start); - true + None } } else { - false + None } } +struct UnicodeSetsConsumeResult { + may_contain_strings: Option, +} + fn consume_character_class_escape<'a>(parser: &mut Parser<'a>) -> Option { let start = parser.index; if parser.eat(LATIN_SMALL_LETTER_D) { parser.last_int_value = -1; parser.on_escape_character_set(start - 1, parser.index, "digit", false); - return Some(UnicodeSetsConsumeResult { may_contain_strings: false }); + return Some(UnicodeSetsConsumeResult { may_contain_strings: None }); } if parser.eat(LATIN_CAPITAL_LETTER_D) { - parser._last_int_value = -1; + parser.last_int_value = -1; parser.on_escape_character_set(start - 1, parser.index, "digit", true); - return Some(UnicodeSetsConsumeResult { may_contain_strings: false }); + return Some(UnicodeSetsConsumeResult { may_contain_strings: None }); } if parser.eat(LATIN_SMALL_LETTER_S) { - parser._last_int_value = -1; + parser.last_int_value = -1; parser.on_escape_character_set(start - 1, parser.index, "space", false); - return Some(UnicodeSetsConsumeResult { may_contain_strings: false }); + return Some(UnicodeSetsConsumeResult { may_contain_strings: None }); } if parser.eat(LATIN_CAPITAL_LETTER_S) { - parser._last_int_value = -1; + parser.last_int_value = -1; parser.on_escape_character_set(start - 1, parser.index, "space", true); - return Some(UnicodeSetsConsumeResult { may_contain_strings: false }); + return Some(UnicodeSetsConsumeResult { may_contain_strings: None }); } if parser.eat(LATIN_SMALL_LETTER_W) { - parser._last_int_value = -1; + parser.last_int_value = -1; parser.on_escape_character_set(start - 1, parser.index, "word", false); - return Some(UnicodeSetsConsumeResult { may_contain_strings: false }); + return Some(UnicodeSetsConsumeResult { may_contain_strings: None }); } if parser.eat(LATIN_CAPITAL_LETTER_W) { - parser._last_int_value = -1; + parser.last_int_value = -1; parser.on_escape_character_set(start - 1, parser.index, "word", true); - return Some(UnicodeSetsConsumeResult { may_contain_strings: false }); + return Some(UnicodeSetsConsumeResult { may_contain_strings: None }); } let mut negate = false; @@ -518,7 +532,7 @@ fn consume_character_class_escape<'a>(parser: &mut Parser<'a>) -> Option= 2018 && (parser.eat(LATIN_SMALL_LETTER_P) || (negate = parser.eat(LATIN_CAPITAL_LETTER_P))) { - parser._last_int_value = -1; + parser.last_int_value = -1; if parser.eat(LEFT_CURLY_BRACKET) { if let Some(result) = parser.eat_unicode_property_value_expression() { if parser.eat(RIGHT_CURLY_BRACKET) { @@ -548,20 +562,24 @@ fn consume_character_class_escape<'a>(parser: &mut Parser<'a>) -> Option(parser: &mut Parser<'a>) -> bool { +fn consume_k_group_name<'a>(parser: &mut Parser<'a>) -> Option> { let start = parser.index; - if parser.eat(LATIN_SMALL_LETTER_K) { + if parser.eat('k') { if parser.eat_group_name() { - let group_name = parser._last_str_value.clone(); - parser._backreference_names.insert(group_name.clone()); - parser.on_backreference(start - 1, parser.index, group_name); - return true; + let group_name: String = parser.last_str_value.clone(); + parser.back_reference_names.insert(group_name.clone()); + return Some(Backreference { + span: parser.span_with_start(start), + reference: BackreferenceRef::Atom(group_name.as_str().into()), + // dummy resolved + resolved: CapturingGroup::default(), + }); } - parser.raise("Invalid named reference"); + panic!("Invalid named reference"); } - false + None } fn consume_character_class<'a>(parser: &mut Parser<'a>) -> Option { @@ -570,7 +588,7 @@ fn consume_character_class<'a>(parser: &mut Parser<'a>) -> Option(parser: &mut Parser<'a>) -> UnicodeSetsConsumeResu // ClassContents[UnicodeMode, UnicodeSetsMode] :: // [empty] // 1. Return false. - return UnicodeSetsConsumeResult { may_contain_strings: false }; + return UnicodeSetsConsumeResult { may_contain_strings: None }; } let result = self.consume_class_set_expression(); From dedd6d69af6f44a31ad396ef7f502f66dc372d36 Mon Sep 17 00:00:00 2001 From: IWANABETHATGUY Date: Sun, 14 Jan 2024 22:38:21 +0800 Subject: [PATCH 11/19] =?UTF-8?q?feat:=20=F0=9F=8E=B8=20oct?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/oxc_js_regex/src/parser.rs | 46 +++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/crates/oxc_js_regex/src/parser.rs b/crates/oxc_js_regex/src/parser.rs index 061906951b33d..ca5eeccbf2dce 100644 --- a/crates/oxc_js_regex/src/parser.rs +++ b/crates/oxc_js_regex/src/parser.rs @@ -1587,3 +1587,49 @@ fn eat_decimal_escape<'a>(parser: &mut Parser<'a>) -> bool { } false } + +/** + * Eat the next characters as a `OctalDigit` production if possible. + * Set `self._last_int_value` if it ate the next characters successfully. + * ``` + * OctalDigit:: one of + * 0 1 2 3 4 5 6 7 + * ``` + * @returns `true` if it ate the next characters successfully. + */ +fn eat_octal_digit<'a>(parser: &mut Parser<'a>) -> Option<()> { + let cp = parser.current()?; + if cp.is_digit(8) { + parser.advance(); + parser.last_int_value = cp.to_digit(8)?; + Some(()) + } else { + parser.last_int_value = 0; + None + } +} + +/** + * Eat the next characters as the given number of `HexDigit` productions if + * possible. + * Set `self._last_int_value` if it ate the next characters successfully. + * ``` + * HexDigit:: one of + * 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F + * ``` + * @returns `true` if it ate the next characters successfully. + */ +fn eat_fixed_hex_digits<'a>(parser: &mut Parser<'a>, length: usize) -> Option<()> { + let start = parser.index; + parser.last_int_value = 0; + for _ in 0..length { + let cp = parser.current()?; + if !cp.is_ascii_hexdigit() { + parser.rewind(start); + return None; + } + parser.last_int_value = 16 * parser.last_int_value + cp.to_digit(16)? as usize; + parser.advance(); + } + Some(()) +} From 970d9de4ae827628f5a198c26194d2e983036ac1 Mon Sep 17 00:00:00 2001 From: IWANABETHATGUY Date: Sun, 14 Jan 2024 23:16:34 +0800 Subject: [PATCH 12/19] =?UTF-8?q?chore:=20=F0=9F=A4=96=20ck=20point?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Cargo.lock | 2 + crates/oxc_js_regex/Cargo.toml | 6 +- crates/oxc_js_regex/src/lib.rs | 1 + crates/oxc_js_regex/src/parser.rs | 147 ++++++++++++++---------------- crates/oxc_js_regex/src/util.rs | 7 ++ 5 files changed, 81 insertions(+), 82 deletions(-) create mode 100644 crates/oxc_js_regex/src/util.rs diff --git a/Cargo.lock b/Cargo.lock index b054d73d70750..1d13b2a27b61d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1602,6 +1602,8 @@ dependencies = [ "oxc_allocator", "oxc_diagnostics", "oxc_span", + "oxc_syntax", + "phf", ] [[package]] diff --git a/crates/oxc_js_regex/Cargo.toml b/crates/oxc_js_regex/Cargo.toml index 3f65315f2e836..e473738bc7bbe 100644 --- a/crates/oxc_js_regex/Cargo.toml +++ b/crates/oxc_js_regex/Cargo.toml @@ -19,6 +19,8 @@ workspace = true doctest = false [dependencies] -oxc_allocator = { workspace = true } -oxc_span = { workspace = true } +phf = { workspace = true } +oxc_allocator = { workspace = true } +oxc_span = { workspace = true } oxc_diagnostics = { workspace = true } +oxc_syntax.workspace = true diff --git a/crates/oxc_js_regex/src/lib.rs b/crates/oxc_js_regex/src/lib.rs index 515c301327a72..5062aac9f87e6 100644 --- a/crates/oxc_js_regex/src/lib.rs +++ b/crates/oxc_js_regex/src/lib.rs @@ -4,5 +4,6 @@ mod ast_kind; mod ecma_version; mod lexer; pub mod parser; +mod util; pub mod validator; pub mod visitor; diff --git a/crates/oxc_js_regex/src/parser.rs b/crates/oxc_js_regex/src/parser.rs index ca5eeccbf2dce..2de56395648e8 100644 --- a/crates/oxc_js_regex/src/parser.rs +++ b/crates/oxc_js_regex/src/parser.rs @@ -7,6 +7,7 @@ use std::str::{CharIndices, Chars, Matches}; use oxc_diagnostics::Error; use oxc_span::Span; +use oxc_syntax::unicode_id_start::is_id_continue; use crate::ast::{ Alternative, Assertion, Backreference, BackreferenceRef, BoundaryAssertion, Branch, @@ -16,6 +17,7 @@ use crate::ast::{ }; use crate::ast_builder::AstBuilder; use crate::ecma_version::EcmaVersion; +use crate::util::is_syntax_character; pub struct Lexer<'a> { source: &'a str, @@ -873,7 +875,7 @@ fn consume_class_union_right<'a>( // 1. If MayContainStrings of the ClassSetOperand is true, return true. // 2. If ClassUnion is present, return MayContainStrings of the ClassUnion. // 3. Return false. - return UnicodeSetsConsumeResult { may_contain_strings }; + return UnicodeSetsConsumeResult { may_contain_strings: Some(may_contain_strings) }; } fn eat_decimal_digits<'a>(parser: &mut Parser<'a>) -> bool { @@ -889,6 +891,22 @@ fn eat_decimal_digits<'a>(parser: &mut Parser<'a>) -> bool { parser.index != start } +fn eat_hex_digits(parser: &mut Parser<'a>) -> bool { + let start = parser.index; + parser.last_int_value = 0; + + while let Some(ch) = parser.current() { + if !ch.is_ascii_hexdigit() { + break; + } + parser.last_int_value = + 16 * parser.last_int_value + ch.to_digit(16).expect("should convert successfully"); + parser.advance(); + } + + parser.index != start +} + fn count_capturing_parens<'a>(parser: &mut Parser<'a>) -> usize { let start = parser.index; let mut in_class = false; @@ -1346,48 +1364,6 @@ fn eat_reg_exp_unicode_code_point_escape(parser: &mut Parser<'a>) -> bool { false } -/** - * Eat the next characters as a RegExp `IdentityEscape` production if - * possible. - * Set `self._last_int_value` if it ate the next characters successfully. - * ``` - * IdentityEscape[UnicodeMode, N]:: - * [+UnicodeMode] SyntaxCharacter - * [+UnicodeMode] `/` - * [strict][~UnicodeMode] SourceCharacter but not UnicodeIDContinue - * [annexB][~UnicodeMode] SourceCharacterIdentityEscape[?N] - * SourceCharacterIdentityEscape[N]:: - * [~N] SourceCharacter but not c - * [+N] SourceCharacter but not one of c k - * ``` - * @returns `true` if it ate the next characters successfully. - */ -fn eat_identity_escape<'a>(parser: &mut Parser<'a>) -> bool { - let cp = self.current_code_point; - if self.is_valid_identity_escape(cp) { - self._last_int_value = cp; - self.advance(); - return true; - } - false -} - -fn is_valid_identity_escape<'a>(parser: &mut Parser<'a>, cp: i32) -> bool { - if cp == -1 { - return false; - } - if self._unicode_mode { - return is_syntax_character(cp) || cp == SOLIDUS; - } - if self.strict { - return !is_id_continue(cp); - } - if self._n_flag { - return !(cp == LATIN_SMALL_LETTER_C || cp == LATIN_SMALL_LETTER_K); - } - cp != LATIN_SMALL_LETTER_C -} - /** * Eat the next characters as a RegExp `DecimalEscape` production if * possible. @@ -1423,14 +1399,14 @@ fn eat_decimal_escape<'a>(parser: &mut Parser<'a>) -> bool { * ``` * @returns `true` if it ate the next characters successfully. */ -fn eat_control_letter<'a>(parser: &mut Parser<'a>) -> bool { - let cp = parser.current(); - if is_latin_letter(cp) { +fn eat_control_letter<'a>(parser: &mut Parser<'a>) -> Option<()> { + let cp = parser.current()?; + if cp.is_ascii_alphabetic() { parser.advance(); - parser.last_int_value = cp % 0x20; - return true; + parser.last_int_value = (cp as usize) % 0x20; + return Some(()); } - false + None } /** @@ -1508,17 +1484,17 @@ fn eat_reg_exp_unicode_surrogate_pair_escape<'a>(parser: &mut Parser<'a>) -> boo * @returns `true` if it ate the next characters successfully. */ fn eat_reg_exp_unicode_code_point_escape<'a>(parser: &mut Parser<'a>) -> bool { - let start = self.index; + let start = parser.index; - if self.eat(LEFT_CURLY_BRACKET) - && self.eat_hex_digits() - && self.eat(RIGHT_CURLY_BRACKET) - && is_valid_unicode(self._last_int_value) + if parser.eat('{') + && eat_hex_digits(parser) + && parser.eat('}') + && is_valid_unicode(parser.last_int_value) { return true; } - self.rewind(start); + parser.rewind(start); false } @@ -1538,30 +1514,31 @@ fn eat_reg_exp_unicode_code_point_escape<'a>(parser: &mut Parser<'a>) -> bool { * ``` * @returns `true` if it ate the next characters successfully. */ -fn eat_identity_escape<'a>(parser: &mut Parser<'a>) -> bool { - let cp = self.current_code_point; - if self.is_valid_identity_escape(cp) { - self._last_int_value = cp; - self.advance(); +fn eat_identity_escape<'a>(parser: &mut Parser<'a>) -> Option<()> { + let cp = parser.current(); + if parser.is_valid_identity_escape(cp.cloned()) { + parser.last_int_value = cp.unwrap() as usize; + parser.advance(); return true; } - false + None } -fn is_valid_identity_escape(&self, cp: i32) -> bool { - if cp == -1 { +fn is_valid_identity_escape(parser: &mut Parser<'a>, cp: Option) -> bool { + if cp.is_none() { return false; } - if self._unicode_mode { - return is_syntax_character(cp) || cp == SOLIDUS; + let cp = cp.unwrap(); + if parser.context.unicode_mode { + return is_syntax_character(cp) || cp == '/'; } - if self.strict { + if parser.context.strict { return !is_id_continue(cp); } - if self._n_flag { - return !(cp == LATIN_SMALL_LETTER_C || cp == LATIN_SMALL_LETTER_K); + if parser.context.nflag { + return !(cp == 'c' || cp == 'k'); } - cp != LATIN_SMALL_LETTER_C + cp != 'c' } /** @@ -1574,18 +1551,21 @@ fn is_valid_identity_escape(&self, cp: i32) -> bool { * ``` * @returns `true` if it ate the next characters successfully. */ -fn eat_decimal_escape<'a>(parser: &mut Parser<'a>) -> bool { - self._last_int_value = 0; - let mut cp = self.current_code_point; - if cp >= DIGIT_ONE && cp <= DIGIT_NINE { - while cp >= DIGIT_ZERO && cp <= DIGIT_NINE { - self._last_int_value = 10 * self._last_int_value + (cp - DIGIT_ZERO); - self.advance(); - cp = self.current_code_point; +fn eat_decimal_escape<'a>(parser: &mut Parser<'a>) -> Option<()> { + parser.last_int_value = 0; + let mut cp = parser.current()?; + if cp.is_ascii_digit() { + while cp.is_ascii_digit() { + parser.last_int_value = 10 * parser.last_int_value + cp.to_digit(10)?; + parser.advance(); + cp = match parser.current() { + Some(char) => char, + None => break, + }; } - return true; + return Some(()); } - false + None } /** @@ -1601,7 +1581,7 @@ fn eat_octal_digit<'a>(parser: &mut Parser<'a>) -> Option<()> { let cp = parser.current()?; if cp.is_digit(8) { parser.advance(); - parser.last_int_value = cp.to_digit(8)?; + parser.last_int_value = cp.to_digit(8)? as usize; Some(()) } else { parser.last_int_value = 0; @@ -1633,3 +1613,10 @@ fn eat_fixed_hex_digits<'a>(parser: &mut Parser<'a>, length: usize) -> Option<() } Some(()) } + +const MIN_CODE_POINT: u32 = 0; +const MAX_CODE_POINT: u32 = 0x10FFFF; + +fn is_valid_unicode(code: u32) -> bool { + code >= MIN_CODE_POINT && code <= MAX_CODE_POINT +} diff --git a/crates/oxc_js_regex/src/util.rs b/crates/oxc_js_regex/src/util.rs new file mode 100644 index 0000000000000..c89e651da867e --- /dev/null +++ b/crates/oxc_js_regex/src/util.rs @@ -0,0 +1,7 @@ +use phf::phf_set; + +const SYNTAX_CHARACTERS: phf::Set = phf_set!['(', ')', '[', ']', '{', '}', '|', '-']; +#[inline] +pub fn is_syntax_character(cp: char) -> bool { + SYNTAX_CHARACTERS.contains(&cp) +} From 408e3026d9ec0573f425569d84f95cafe2af4647 Mon Sep 17 00:00:00 2001 From: IWANABETHATGUY Date: Sun, 14 Jan 2024 23:20:06 +0800 Subject: [PATCH 13/19] =?UTF-8?q?chore:=20=F0=9F=A4=96=20ck=20point?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/oxc_js_regex/src/parser.rs | 26 ++++++++++++++------------ crates/oxc_js_regex/src/util.rs | 12 ++++++++++++ 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/crates/oxc_js_regex/src/parser.rs b/crates/oxc_js_regex/src/parser.rs index 2de56395648e8..8cd5c42c3362f 100644 --- a/crates/oxc_js_regex/src/parser.rs +++ b/crates/oxc_js_regex/src/parser.rs @@ -17,7 +17,9 @@ use crate::ast::{ }; use crate::ast_builder::AstBuilder; use crate::ecma_version::EcmaVersion; -use crate::util::is_syntax_character; +use crate::util::{ + combine_surrogate_pair, is_lead_surrogate, is_syntax_character, is_trail_surrogate, +}; pub struct Lexer<'a> { source: &'a str, @@ -1453,23 +1455,23 @@ fn eat_reg_exp_unicode_escape_sequence<'a>(parser: &mut Parser<'a>, force_u_flag * @returns `true` if it ate the next characters successfully. */ fn eat_reg_exp_unicode_surrogate_pair_escape<'a>(parser: &mut Parser<'a>) -> bool { - let start = self.index; + let start = parser.index; - if self.eat_fixed_hex_digits(4) { - let lead = self._last_int_value; - if is_lead_surrogate(lead) - && self.eat(REVERSE_SOLIDUS) - && self.eat(LATIN_SMALL_LETTER_U) - && self.eat_fixed_hex_digits(4) + if parser.eat_fixed_hex_digits(4) { + let lead = parser.last_int_value; + if is_lead_surrogate(lead as u32) + && parser.eat('\\') + && parser.eat('u') + && parser.eat_fixed_hex_digits(4) { - let trail = self._last_int_value; - if is_trail_surrogate(trail) { - self._last_int_value = combine_surrogate_pair(lead, trail); + let trail = parser.last_int_value; + if is_trail_surrogate(trail as u32) { + parser.last_int_value = combine_surrogate_pair(lead, trail) as usize; return true; } } - self.rewind(start); + parser.rewind(start); } false diff --git a/crates/oxc_js_regex/src/util.rs b/crates/oxc_js_regex/src/util.rs index c89e651da867e..9abd39e754b66 100644 --- a/crates/oxc_js_regex/src/util.rs +++ b/crates/oxc_js_regex/src/util.rs @@ -5,3 +5,15 @@ const SYNTAX_CHARACTERS: phf::Set = phf_set!['(', ')', '[', ']', '{', '}', pub fn is_syntax_character(cp: char) -> bool { SYNTAX_CHARACTERS.contains(&cp) } + +pub fn is_lead_surrogate(code: char) -> bool { + code >= 0xd800 && code <= 0xdbff +} + +pub fn is_trail_surrogate(code: u32) -> bool { + code >= 0xdc00 && code <= 0xdfff +} + +pub fn combine_surrogate_pair(lead: u32, trail: u32) -> u32 { + (lead - 0xd800) * 0x400 + (trail - 0xdc00) + 0x10000 +} From 907d7c04c71b24b9f148ef30ef396b613741debd Mon Sep 17 00:00:00 2001 From: IWANABETHATGUY Date: Sun, 14 Jan 2024 23:24:49 +0800 Subject: [PATCH 14/19] =?UTF-8?q?chore:=20=F0=9F=A4=96=20ck=20point?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/oxc_js_regex/src/parser.rs | 38 +++++++++++++++++-------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/crates/oxc_js_regex/src/parser.rs b/crates/oxc_js_regex/src/parser.rs index 8cd5c42c3362f..212c87cd720f2 100644 --- a/crates/oxc_js_regex/src/parser.rs +++ b/crates/oxc_js_regex/src/parser.rs @@ -1376,18 +1376,22 @@ fn eat_reg_exp_unicode_code_point_escape(parser: &mut Parser<'a>) -> bool { * ``` * @returns `true` if it ate the next characters successfully. */ -fn eat_decimal_escape<'a>(parser: &mut Parser<'a>) -> bool { +fn eat_decimal_escape<'a>(parser: &mut Parser<'a>) -> Option<()> { parser.last_int_value = 0; - let mut cp = parser.current(); - if cp >= Some(&'1') && cp <= Some(&'9') { - while cp >= Some(&'1') && cp <= Some(&'9') { - parser.last_int_value = 10 * parser.last_int_value + (cp - DIGIT_ZERO); + let mut cp = parser.current()?; + if cp >= &'1' && cp <= &'9' { + while cp >= &'1' && cp <= &'9' { + parser.last_int_value = 10 * parser.last_int_value + + cp.to_digit(10).expect("should convert successfully") as usize; parser.advance(); - cp = parser.current(); + cp = match parser.current() { + Some(ch) => ch, + None => break, + }; } - return true; + return Some(()); } - false + None } /** @@ -1427,20 +1431,20 @@ fn eat_control_letter<'a>(parser: &mut Parser<'a>) -> Option<()> { * @returns `true` if it ate the next characters successfully. */ fn eat_reg_exp_unicode_escape_sequence<'a>(parser: &mut Parser<'a>, force_u_flag: bool) -> bool { - let start = self.index; - let u_flag = force_u_flag || self._unicode_mode; + let start = parser.index; + let u_flag = force_u_flag || parser.context.unicode_mode; - if self.eat(LATIN_SMALL_LETTER_U) { - if (u_flag && self.eat_reg_exp_unicode_surrogate_pair_escape()) - || self.eat_fixed_hex_digits(4) - || (u_flag && self.eat_reg_exp_unicode_code_point_escape()) + if parser.eat('u') { + if (u_flag && eat_reg_exp_unicode_surrogate_pair_escape(parser)) + || eat_fixed_hex_digits(parser, 4).is_some() + || (u_flag && eat_reg_exp_unicode_code_point_escape(parser)) { return true; } - if self.strict || u_flag { - self.raise("Invalid unicode escape"); + if parser.context.strict || u_flag { + panic!("Invalid unicode escape"); } - self.rewind(start); + parser.rewind(start); } false From 852691da079ae788e4f13553f66d7a43d945148d Mon Sep 17 00:00:00 2001 From: IWANABETHATGUY Date: Sun, 14 Jan 2024 23:36:04 +0800 Subject: [PATCH 15/19] =?UTF-8?q?chore:=20=F0=9F=A4=96=20control=20eascape?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/oxc_js_regex/src/parser.rs | 144 ++++-------------------------- 1 file changed, 17 insertions(+), 127 deletions(-) diff --git a/crates/oxc_js_regex/src/parser.rs b/crates/oxc_js_regex/src/parser.rs index 212c87cd720f2..8af85e936d56d 100644 --- a/crates/oxc_js_regex/src/parser.rs +++ b/crates/oxc_js_regex/src/parser.rs @@ -1233,116 +1233,29 @@ fn eat_zero<'a>(parser: &mut Parser<'a>) -> bool { * @returns `true` if it ate the next characters successfully. */ fn eat_control_escape<'a>(parser: &mut Parser<'a>) -> bool { - if self.eat(LATIN_SMALL_LETTER_F) { - self._last_int_value = FORM_FEED; + if parser.eat('f') { + parser.last_int_value = 12; return true; } - if self.eat(LATIN_SMALL_LETTER_N) { - self._last_int_value = LINE_FEED; + if parser.eat('n') { + parser.last_int_value = 10; return true; } - if self.eat(LATIN_SMALL_LETTER_R) { - self._last_int_value = CARRIAGE_RETURN; + if parser.eat('r') { + parser.last_int_value = 13; return true; } - if self.eat(LATIN_SMALL_LETTER_T) { - self._last_int_value = CHARACTER_TABULATION; + if parser.eat('t') { + parser.last_int_value = 9; return true; } - if self.eat(LATIN_SMALL_LETTER_V) { - self._last_int_value = LINE_TABULATION; + if parser.eat('v') { + parser.last_int_value = 11; return true; } false } -/** - * Eat the next characters as a RegExp `ControlLetter` production if - * possible. - * Set `self._last_int_value` if it ate the next characters successfully. - * ``` - * ControlLetter:: one of - * a b c d e f g h i j k l m n o p q r s t u v w x y z - * A B C D E F G H I J K L M N O P Q R S T U V W X Y Z - * ``` - * @returns `true` if it ate the next characters successfully. - */ -fn eat_control_letter<'a>(parser: &mut Parser<'a>) -> bool { - let cp = self.current_code_point; - if is_latin_letter(cp) { - self.advance(); - self._last_int_value = cp % 0x20; - return true; - } - false -} - -/** - * Eat the next characters as a RegExp `RegExpUnicodeEscapeSequence` - * production if possible. - * Set `self._last_int_value` if it ate the next characters successfully. - * ``` - * RegExpUnicodeEscapeSequence[UnicodeMode]:: - * [+UnicodeMode] `u` HexLeadSurrogate `\u` HexTrailSurrogate - * [+UnicodeMode] `u` HexLeadSurrogate - * [+UnicodeMode] `u` HexTrailSurrogate - * [+UnicodeMode] `u` HexNonSurrogate - * [~UnicodeMode] `u` Hex4Digits - * [+UnicodeMode] `u{` CodePoint `}` - * ``` - * @returns `true` if it ate the next characters successfully. - */ -fn eat_reg_exp_unicode_escape_sequence<'a>(parser: &mut Parser<'a>, force_u_flag: bool) -> bool { - let start = self.index; - let u_flag = force_u_flag || self._unicode_mode; - - if self.eat(LATIN_SMALL_LETTER_U) { - if (u_flag && self.eat_reg_exp_unicode_surrogate_pair_escape()) - || self.eat_fixed_hex_digits(4) - || (u_flag && self.eat_reg_exp_unicode_code_point_escape()) - { - return true; - } - if self.strict || u_flag { - self.raise("Invalid unicode escape"); - } - self.rewind(start); - } - - false -} - -/** - * Eat the next characters as the following alternatives if possible. - * Set `self._last_int_value` if it ate the next characters successfully. - * ``` - * HexLeadSurrogate `\u` HexTrailSurrogate - * ``` - * @returns `true` if it ate the next characters successfully. - */ -fn eat_reg_exp_unicode_surrogate_pair_escape<'a>(parser: &mut Parser<'a>) -> bool { - let start = self.index; - - if self.eat_fixed_hex_digits(4) { - let lead = self._last_int_value; - if is_lead_surrogate(lead) - && self.eat(REVERSE_SOLIDUS) - && self.eat(LATIN_SMALL_LETTER_U) - && self.eat_fixed_hex_digits(4) - { - let trail = self._last_int_value; - if is_trail_surrogate(trail) { - self._last_int_value = combine_surrogate_pair(lead, trail); - return true; - } - } - - self.rewind(start); - } - - false -} - /** * Eat the next characters as the following alternatives if possible. * Set `self._last_int_value` if it ate the next characters successfully. @@ -1351,18 +1264,18 @@ fn eat_reg_exp_unicode_surrogate_pair_escape<'a>(parser: &mut Parser<'a>) -> boo * ``` * @returns `true` if it ate the next characters successfully. */ -fn eat_reg_exp_unicode_code_point_escape(parser: &mut Parser<'a>) -> bool { - let start = self.index; +fn eat_reg_exp_unicode_code_point_escape<'a>(parser: &mut Parser<'a>) -> bool { + let start = parser.index; - if self.eat(LEFT_CURLY_BRACKET) - && self.eat_hex_digits() - && self.eat(RIGHT_CURLY_BRACKET) - && is_valid_unicode(self._last_int_value) + if parser.eat('{') + && eat_hex_digits(parser) + && parser.eat('}') + && is_valid_unicode(parser.last_int_value as u32) { return true; } - self.rewind(start); + parser.rewind(start); false } @@ -1481,29 +1394,6 @@ fn eat_reg_exp_unicode_surrogate_pair_escape<'a>(parser: &mut Parser<'a>) -> boo false } -/** - * Eat the next characters as the following alternatives if possible. - * Set `self._last_int_value` if it ate the next characters successfully. - * ``` - * `{` CodePoint `}` - * ``` - * @returns `true` if it ate the next characters successfully. - */ -fn eat_reg_exp_unicode_code_point_escape<'a>(parser: &mut Parser<'a>) -> bool { - let start = parser.index; - - if parser.eat('{') - && eat_hex_digits(parser) - && parser.eat('}') - && is_valid_unicode(parser.last_int_value) - { - return true; - } - - parser.rewind(start); - false -} - /** * Eat the next characters as a RegExp `IdentityEscape` production if * possible. From 4a39f2e7d2b480173e3c9096867de5a5aa5ab652 Mon Sep 17 00:00:00 2001 From: IWANABETHATGUY Date: Sun, 14 Jan 2024 23:49:08 +0800 Subject: [PATCH 16/19] =?UTF-8?q?chore:=20=F0=9F=A4=96=20ck=20point?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/oxc_js_regex/src/parser.rs | 93 ++++++++++++++++--------------- crates/oxc_js_regex/src/util.rs | 2 +- 2 files changed, 50 insertions(+), 45 deletions(-) diff --git a/crates/oxc_js_regex/src/parser.rs b/crates/oxc_js_regex/src/parser.rs index 8af85e936d56d..b4fbb6dfc46be 100644 --- a/crates/oxc_js_regex/src/parser.rs +++ b/crates/oxc_js_regex/src/parser.rs @@ -7,6 +7,7 @@ use std::str::{CharIndices, Chars, Matches}; use oxc_diagnostics::Error; use oxc_span::Span; +use oxc_syntax::identifier::is_identifier_part; use oxc_syntax::unicode_id_start::is_id_continue; use crate::ast::{ @@ -130,8 +131,8 @@ impl<'a> Parser<'a> { self.lexer.chars.get(self.index + range.start..(self.index + range.end)) } - pub fn current(&self) -> Option<&char> { - self.lexer.chars.get(self.index) + pub fn current(&self) -> Option { + self.lexer.chars.get(self.index).copied() } pub fn advance(&mut self) -> bool { @@ -143,7 +144,7 @@ impl<'a> Parser<'a> { } } - pub fn rewind<'a>(parser: &mut Parser<'a>, start: usize) { + pub fn rewind<'a>(&mut self, start: usize) { self.index = start; } } @@ -1127,26 +1128,28 @@ fn eat_reg_exp_identifier_name<'a>(parser: &mut Parser<'a>) -> bool { * Set `self._last_int_value` if the identifier start existed. * @returns `true` if it ate the next characters successfully. */ -fn eat_reg_exp_identifier_start<'a>(parser: &mut Parser<'a>) -> bool { - let start = self.index; - let force_u_flag = !self._unicode_mode && self.ecma_version >= 2020; - let mut cp = self.current_code_point; - self.advance(); - - if cp == REVERSE_SOLIDUS && self.eat_reg_exp_unicode_escape_sequence(force_u_flag) { - cp = self._last_int_value; - } else if force_u_flag && is_lead_surrogate(cp) && is_trail_surrogate(self.current_code_point) { - cp = combine_surrogate_pair(cp, self.current_code_point); - self.advance(); +fn eat_reg_exp_identifier_start<'a>(parser: &mut Parser<'a>) -> Option<()> { + let start = parser.index; + let force_u_flag = + !parser.context.unicode_mode && parser.context.ecma_version >= EcmaVersion::V2020; + let mut cp = *parser.current()?; + parser.advance(); + + if cp == '\\' && eat_reg_exp_unicode_escape_sequence(parser, force_u_flag) { + cp = parser.last_int_value as u32 as char; + } else if force_u_flag && is_lead_surrogate(cp) && is_trail_surrogate(parser.current()? as u32) + { + cp = combine_surrogate_pair(cp, parser.current() as u32); + parser.advance(); } if is_identifier_start_char(cp) { - self._last_int_value = cp; + parser.last_int_value = cp; return true; } - if self.index != start { - self.rewind(start); + if parser.index != start { + parser.rewind(start); } false } @@ -1162,28 +1165,30 @@ fn eat_reg_exp_identifier_start<'a>(parser: &mut Parser<'a>) -> bool { * ``` * @returns `true` if it ate the next characters successfully. */ -fn eat_reg_exp_identifier_part<'a>(parser: &mut Parser<'a>) -> bool { - let start = self.index; - let force_u_flag = !self._unicode_mode && self.ecma_version >= 2020; - let mut cp = self.current_code_point; - self.advance(); - - if cp == REVERSE_SOLIDUS && self.eat_reg_exp_unicode_escape_sequence(force_u_flag) { - cp = self._last_int_value; - } else if force_u_flag && is_lead_surrogate(cp) && is_trail_surrogate(self.current_code_point) { - cp = combine_surrogate_pair(cp, self.current_code_point); - self.advance(); +fn eat_reg_exp_identifier_part<'a>(parser: &mut Parser<'a>) -> Option<()> { + let start = parser.index; + let force_u_flag = + !parser.context.unicode_mode && parser.context.ecma_version >= EcmaVersion::V2020; + let mut cp = *parser.current()?; + parser.advance(); + + if cp == '\\' && eat_reg_exp_unicode_escape_sequence(parser, force_u_flag) { + cp = parser.last_int_value as u32 as char; + } else if force_u_flag && is_lead_surrogate(cp) && is_trail_surrogate(parser.current()? as u32) + { + cp = combine_surrogate_pair(cp, parser.current()? as u32); + parser.advance(); } - if is_identifier_part_char(cp) { - self._last_int_value = cp; - return true; + if is_identifier_part(cp) { + parser.last_int_value = cp as usize; + return Some(()); } - if self.index != start { - self.rewind(start); + if parser.index != start { + parser.rewind(start); } - false + None } /** @@ -1195,12 +1200,12 @@ fn eat_reg_exp_identifier_part<'a>(parser: &mut Parser<'a>) -> bool { * @returns `true` if it ate the next characters successfully. */ fn eat_c_control_letter<'a>(parser: &mut Parser<'a>) -> bool { - let start = self.index; - if self.eat(LATIN_SMALL_LETTER_C) { - if self.eat_control_letter() { + let start = parser.index; + if parser.eat('c') { + if eat_control_letter(parser).is_some() { return true; } - self.rewind(start); + parser.rewind(start); } false } @@ -1213,13 +1218,13 @@ fn eat_c_control_letter<'a>(parser: &mut Parser<'a>) -> bool { * ``` * @returns `true` if it ate the next characters successfully. */ -fn eat_zero<'a>(parser: &mut Parser<'a>) -> bool { - if self.current_code_point == DIGIT_ZERO && !is_decimal_digit(self.next_code_point) { - self._last_int_value = 0; - self.advance(); - return true; +fn eat_zero<'a>(parser: &mut Parser<'a>) -> Option<()> { + if parser.current()? == '0' && parser.nth(1).map(|ch| ch.is_ascii_digit()) == Some(false) { + parser.last_int_value = 0; + parser.advance(); + return Some(()); } - false + None } /** diff --git a/crates/oxc_js_regex/src/util.rs b/crates/oxc_js_regex/src/util.rs index 9abd39e754b66..d30f37eebc99e 100644 --- a/crates/oxc_js_regex/src/util.rs +++ b/crates/oxc_js_regex/src/util.rs @@ -6,7 +6,7 @@ pub fn is_syntax_character(cp: char) -> bool { SYNTAX_CHARACTERS.contains(&cp) } -pub fn is_lead_surrogate(code: char) -> bool { +pub fn is_lead_surrogate(code: u32) -> bool { code >= 0xd800 && code <= 0xdbff } From 0c5e0ff7bb57b4c3bf5f74781a2a976902631409 Mon Sep 17 00:00:00 2001 From: IWANABETHATGUY Date: Mon, 15 Jan 2024 00:15:10 +0800 Subject: [PATCH 17/19] =?UTF-8?q?feat:=20=F0=9F=8E=B8=20consume=5Fclass=5F?= =?UTF-8?q?set=5Fcharacter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ' | 46 +++++++++++++++++++ crates/oxc_js_regex/src/parser.rs | 74 +++++++++++++++++-------------- crates/oxc_js_regex/src/util.rs | 65 +++++++++++++++++++++++++++ 3 files changed, 151 insertions(+), 34 deletions(-) create mode 100644 ' diff --git a/' b/' new file mode 100644 index 0000000000000..53a6359f76054 --- /dev/null +++ b/' @@ -0,0 +1,46 @@ +use phf::phf_set; + +const SYNTAX_CHARACTERS: phf::Set = phf_set!['(', ')', '[', ']', '{', '}', '|', '-']; + +const CLASS_SET_RESERVED_DOUBLE_PUNCTUATOR_CHARACTER: phf::Set = phf_set! { + '&' => AMPERSAND, + '!' => EXCLAMATION_MARK, + '#' => NUMBER_SIGN, + '$' => DOLLAR_SIGN, + '%' => PERCENT_SIGN, + '*' => ASTERISK, + '+' => PLUS_SIGN, + ',' => COMMA, + '.' => FULL_STOP, + ':' => COLON, + ';' => SEMICOLON, + '<' => LESS_THAN_SIGN, + '=' => EQUALS_SIGN, + '>' => GREATER_THAN_SIGN, + '?' => QUESTION_MARK, + '@' => COMMERCIAL_AT, + '^' => CIRCUMFLEX_ACCENT, + '`' => GRAVE_ACCENT, + '~' => TILDE, +}; + +#[inline] +pub fn is_syntax_character(cp: char) -> bool { + SYNTAX_CHARACTERS.contains(&cp) +} + +pub fn is_lead_surrogate(code: u32) -> bool { + code >= 0xd800 && code <= 0xdbff +} + +pub fn is_trail_surrogate(code: u32) -> bool { + code >= 0xdc00 && code <= 0xdfff +} + +pub fn combine_surrogate_pair(lead: u32, trail: u32) -> u32 { + (lead - 0xd800) * 0x400 + (trail - 0xdc00) + 0x10000 +} + +pub fn is_class_set_reserved_double_punctuator_character(cp: char) -> bool { + CLASS_SET_RESERVED_DOUBLE_PUNCTUATOR_CHARACTER.contains(&cp) +} diff --git a/crates/oxc_js_regex/src/parser.rs b/crates/oxc_js_regex/src/parser.rs index b4fbb6dfc46be..18b24aeb93ae7 100644 --- a/crates/oxc_js_regex/src/parser.rs +++ b/crates/oxc_js_regex/src/parser.rs @@ -19,7 +19,9 @@ use crate::ast::{ use crate::ast_builder::AstBuilder; use crate::ecma_version::EcmaVersion; use crate::util::{ - combine_surrogate_pair, is_lead_surrogate, is_syntax_character, is_trail_surrogate, + combine_surrogate_pair, is_class_set_reserved_double_punctuator_character, + is_class_set_reserved_punctuator, is_class_set_syntax_character, is_lead_surrogate, + is_syntax_character, is_trail_surrogate, }; pub struct Lexer<'a> { @@ -51,7 +53,7 @@ pub struct Parser<'a> { index: usize, group_names: HashSet, num_capturing_parens: usize, - last_int_value: usize, + last_int_value: u32, back_reference_names: HashSet, last_assertion_is_quantifiable: bool, last_range: Range, @@ -123,7 +125,7 @@ impl<'a> Parser<'a> { /// by default next means `nth(1)` pub fn next(&self) -> Option<&char> { - self.lexer.chars.get(self.index + 1) + self.lexer.chars.get(self.index + 1).copied() } /// get a range chars relative from current cursor @@ -1052,40 +1054,44 @@ fn consume_class_string<'a>(parser: &mut Parser<'a>, i: usize) -> UnicodeSetsCon * Set `self._last_int_value` if it consumed the next characters successfully. * @returns `true` if it ate the next characters successfully. */ -fn consume_class_set_character<'a>(parser: &mut Parser<'a>) -> bool { - let start = self.index; - let cp = self.current_code_point; +fn consume_class_set_character<'a>(parser: &mut Parser<'a>) -> Option { + let start = parser.index; + let cp = parser.current()?; - if cp != -1 && cp != self.next_code_point - || !is_class_set_reserved_double_punctuator_character(cp) - { - if cp != -1 && !is_class_set_syntax_character(cp) { - self._last_int_value = cp; - self.advance(); - self.on_character(start, self.index, self._last_int_value); - return true; + if Some(cp) != parser.next() || !is_class_set_reserved_double_punctuator_character(cp) { + if !is_class_set_syntax_character(cp) { + parser.last_int_value = cp as u32; + parser.advance(); + Some(Character { span: parser.span_with_start(start), value: cp }) } } - if self.eat(REVERSE_SOLIDUS) { - if self.consume_character_escape() { + if parser.eat('\\') { + if consume_character_escape() { return true; } - if is_class_set_reserved_punctuator(self.current_code_point) { - self._last_int_value = self.current_code_point; - self.advance(); - self.on_character(start, self.index, self._last_int_value); - return true; + if let Some(ch) = parser.current() + && is_class_set_reserved_punctuator(ch) + { + parser.last_int_value = parser.current()?; + parser.advance(); + + Some(Character { + span: parser.span_with_start(start), + value: parser.last_int_value as char, + }) } - if self.eat(LATIN_SMALL_LETTER_B) { - self._last_int_value = BACKSPACE; - self.on_character(start, self.index, self._last_int_value); - return true; + if parser.eat('b') { + parser.last_int_value = 8; + Some(Character { + span: parser.span_with_start(start), + value: parser.last_int_value as char, + }) } - self.rewind(start); + parser.rewind(start); } - false + None } /** @@ -1094,11 +1100,11 @@ fn consume_class_set_character<'a>(parser: &mut Parser<'a>) -> bool { * @returns `true` if it ate the next characters successfully. */ fn eat_group_name<'a>(parser: &mut Parser<'a>) -> bool { - if self.eat(LESS_THAN_SIGN) { - if self.eat_reg_exp_identifier_name() && self.eat(GREATER_THAN_SIGN) { + if parser.eat('<') { + if eat_reg_exp_identifier_name(parser) && parser.eat('>') { return true; } - self.raise("Invalid capture group name"); + panic!("Invalid capture group name"); } false } @@ -1110,11 +1116,11 @@ fn eat_group_name<'a>(parser: &mut Parser<'a>) -> bool { * @returns `true` if it ate the next characters successfully. */ fn eat_reg_exp_identifier_name<'a>(parser: &mut Parser<'a>) -> bool { - if self.eat_reg_exp_identifier_start() { - self._last_str_value = self._last_int_value.to_string(); + if eat_reg_exp_identifier_start(parser).is_some() { + parser.last_str_value = (parser.last_int_value as char).to_string(); - while self.eat_reg_exp_identifier_part() { - self._last_str_value.push_str(&self._last_int_value.to_string()); + while eat_reg_exp_identifier_part(parser) { + parser.last_str_value.push(parser.last_int_value as char); } return true; diff --git a/crates/oxc_js_regex/src/util.rs b/crates/oxc_js_regex/src/util.rs index d30f37eebc99e..79bea7181163d 100644 --- a/crates/oxc_js_regex/src/util.rs +++ b/crates/oxc_js_regex/src/util.rs @@ -1,6 +1,59 @@ use phf::phf_set; const SYNTAX_CHARACTERS: phf::Set = phf_set!['(', ')', '[', ']', '{', '}', '|', '-']; + +const CLASS_SET_RESERVED_DOUBLE_PUNCTUATOR_CHARACTER: phf::Set = phf_set! { + '&' => AMPERSAND, + '!' => EXCLAMATION_MARK, + '#' => NUMBER_SIGN, + '$' => DOLLAR_SIGN, + '%' => PERCENT_SIGN, + '*' => ASTERISK, + '+' => PLUS_SIGN, + ',' => COMMA, + '.' => FULL_STOP, + ':' => COLON, + ';' => SEMICOLON, + '<' => LESS_THAN_SIGN, + '=' => EQUALS_SIGN, + '>' => GREATER_THAN_SIGN, + '?' => QUESTION_MARK, + '@' => COMMERCIAL_AT, + '^' => CIRCUMFLEX_ACCENT, + '`' => GRAVE_ACCENT, + '~' => TILDE, +}; + +const CLASS_SET_SYNTAX_CHARACTER: phf::Set = phf_set! { + '(' => LEFT_PARENTHESIS, + ')' => RIGHT_PARENTHESIS, + '[' => LEFT_SQUARE_BRACKET, + ']' => RIGHT_SQUARE_BRACKET, + '{' => LEFT_CURLY_BRACKET, + '}' => RIGHT_CURLY_BRACKET, + '/' => SOLIDUS, + '-' => HYPHEN_MINUS, + '\\' => REVERSE_SOLIDUS, + '|' => VERTICAL_LINE, +}; + +const CLASS_SET_RESERVED_PUNCTUATOR: phf::Set = phf_set! { + '&' => AMPERSAND, + '-' => HYPHEN_MINUS, + '!' => EXCLAMATION_MARK, + '#' => NUMBER_SIGN, + '%' => PERCENT_SIGN, + ',' => COMMA, + ':' => COLON, + ';' => SEMICOLON, + '<' => LESS_THAN_SIGN, + '=' => EQUALS_SIGN, + '>' => GREATER_THAN_SIGN, + '@' => COMMERCIAL_AT, + '`' => GRAVE_ACCENT, + '~' => TILDE, +}; + #[inline] pub fn is_syntax_character(cp: char) -> bool { SYNTAX_CHARACTERS.contains(&cp) @@ -17,3 +70,15 @@ pub fn is_trail_surrogate(code: u32) -> bool { pub fn combine_surrogate_pair(lead: u32, trail: u32) -> u32 { (lead - 0xd800) * 0x400 + (trail - 0xdc00) + 0x10000 } + +pub fn is_class_set_reserved_double_punctuator_character(cp: char) -> bool { + CLASS_SET_RESERVED_DOUBLE_PUNCTUATOR_CHARACTER.contains(&cp) +} + +pub fn is_class_set_syntax_character(cp: u32) -> bool { + CLASS_SET_SYNTAX_CHARACTER.contains(&cp) +} + +pub fn is_class_set_reserved_punctuator(cp: u32) -> bool { + CLASS_SET_RESERVED_PUNCTUATOR.contains(&cp) +} From 306be96b2dbd11bb2c8586167a0d1391b660427e Mon Sep 17 00:00:00 2001 From: IWANABETHATGUY Date: Mon, 15 Jan 2024 00:34:22 +0800 Subject: [PATCH 18/19] =?UTF-8?q?feat:=20=F0=9F=8E=B8=20consume=20string?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/oxc_js_regex/src/parser.rs | 84 ++++++++++++++++++++++++++----- crates/oxc_js_regex/src/util.rs | 4 +- 2 files changed, 74 insertions(+), 14 deletions(-) diff --git a/crates/oxc_js_regex/src/parser.rs b/crates/oxc_js_regex/src/parser.rs index 18b24aeb93ae7..724ac3c6fdbcc 100644 --- a/crates/oxc_js_regex/src/parser.rs +++ b/crates/oxc_js_regex/src/parser.rs @@ -14,7 +14,7 @@ use crate::ast::{ Alternative, Assertion, Backreference, BackreferenceRef, BoundaryAssertion, Branch, CapturingGroup, Character, EdgeAssertion, EdgeAssertionKind, Element, LookaheadAssertion, LookaroundAssertion, LookbehindAssertion, Pattern, QuantifiableElement, Quantifier, - RegExpLiteral, WordBoundaryAssertion, + RegExpLiteral, StringAlternative, WordBoundaryAssertion, }; use crate::ast_builder::AstBuilder; use crate::ecma_version::EcmaVersion; @@ -1026,18 +1026,23 @@ fn consume_class_string_disjunction<'a>( * @param i - The index of the string alternative. * @returns `UnicodeSetsConsumeResult`. */ -fn consume_class_string<'a>(parser: &mut Parser<'a>, i: usize) -> UnicodeSetsConsumeResult { - let start = self.index; +fn consume_class_string<'a>( + parser: &mut Parser<'a>, + i: usize, +) -> (UnicodeSetsConsumeResult, Option>) { + let start = parser.index; let mut count = 0; - self.on_string_alternative_enter(start, i); - - while self.current_code_point != -1 && self.consume_class_set_character() { - count += 1; + let mut arr = parser.builder.new_vec(); + while !parser.eof() { + if let Some(character) = consume_class_set_character(parser) { + arr.push(character); + count += 1; + } else { + break; + } } - self.on_string_alternative_leave(start, self.index, i); - // * Static Semantics: MayContainStrings // ClassString :: [empty] // 1. Return true. @@ -1046,7 +1051,10 @@ fn consume_class_string<'a>(parser: &mut Parser<'a>, i: usize) -> UnicodeSetsCon // NonEmptyClassString :: ClassSetCharacter NonEmptyClassString(opt) // 1. If NonEmptyClassString is present, return true. // 2. Return false. - return UnicodeSetsConsumeResult { may_contain_strings: Some(count != 1) }; + ( + UnicodeSetsConsumeResult { may_contain_strings: Some(count != 1) }, + Some(StringAlternative { span: parser.span_with_start(start), elements: arr }), + ) } /** @@ -1067,13 +1075,13 @@ fn consume_class_set_character<'a>(parser: &mut Parser<'a>) -> Option } if parser.eat('\\') { - if consume_character_escape() { + if consume_character_escape(parser) { return true; } if let Some(ch) = parser.current() && is_class_set_reserved_punctuator(ch) { - parser.last_int_value = parser.current()?; + parser.last_int_value = parser.current()? as u32; parser.advance(); Some(Character { @@ -1094,6 +1102,58 @@ fn consume_class_set_character<'a>(parser: &mut Parser<'a>) -> Option None } +fn consume_character_escape<'a>(parser: &mut Parser<'a>) -> Option { + let start = parser.index; + if eat_control_escape(parser) + || eat_c_control_letter(parser) + || eat_zero(parser).is_some() + || eat_hex_escape_sequence(parser) + || eat_reg_exp_unicode_escape_sequence(parser, false) + || (!parser.context.strict + && !parser.context.unicode_mode + && eat_legacy_octal_escape_sequence(parser)) + || eat_identity_escape(parser).is_some() + { + Some(Character { + span: parser.span_with_start(start - 1), + value: parser.last_int_value as char, + }) + } + None +} + +fn eat_hex_escape_sequence<'a>(parser: &mut Parser<'a>) -> bool { + let start = parser.index; + if parser.eat('x') { + if eat_fixed_hex_digits(parser, 2) { + return true; + } + if parser.context.unicode_mode || parser.context.strict { + panic!("Invalid escape"); + } + parser.rewind(start); + } + false +} + +fn eat_legacy_octal_escape_sequence<'a>(parser: &mut Parser<'a>) -> bool { + if eat_octal_digit(parser).is_some() { + let n1 = parser.last_int_value; + if eat_octal_digit(parser).is_some() { + let n2 = parser.last_int_value; + if n1 <= 3 && eat_octal_digit(parser).is_some() { + parser.last_int_value = n1 * 64 + n2 * 8 + parser.last_int_value; + } else { + parser.last_int_value = n1 * 8 + n2; + } + } else { + parser.last_int_value = n1; + } + return true; + } + false +} + /** * Eat the next characters as a RegExp `GroupName` production if possible. * Set `self._last_str_value` if the group name existed. diff --git a/crates/oxc_js_regex/src/util.rs b/crates/oxc_js_regex/src/util.rs index 79bea7181163d..454ea5fb88bc9 100644 --- a/crates/oxc_js_regex/src/util.rs +++ b/crates/oxc_js_regex/src/util.rs @@ -75,10 +75,10 @@ pub fn is_class_set_reserved_double_punctuator_character(cp: char) -> bool { CLASS_SET_RESERVED_DOUBLE_PUNCTUATOR_CHARACTER.contains(&cp) } -pub fn is_class_set_syntax_character(cp: u32) -> bool { +pub fn is_class_set_syntax_character(cp: char) -> bool { CLASS_SET_SYNTAX_CHARACTER.contains(&cp) } -pub fn is_class_set_reserved_punctuator(cp: u32) -> bool { +pub fn is_class_set_reserved_punctuator(cp: char) -> bool { CLASS_SET_RESERVED_PUNCTUATOR.contains(&cp) } From 5bb90f589fdf2b0d2cd713e175b3d7c3b29ccd03 Mon Sep 17 00:00:00 2001 From: IWANABETHATGUY Date: Mon, 15 Jan 2024 01:19:04 +0800 Subject: [PATCH 19/19] =?UTF-8?q?fix:=20=F0=9F=90=9B=20error?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/oxc_js_regex/src/parser.rs | 151 ++++++++++++++++-------------- 1 file changed, 83 insertions(+), 68 deletions(-) diff --git a/crates/oxc_js_regex/src/parser.rs b/crates/oxc_js_regex/src/parser.rs index 724ac3c6fdbcc..bca2c289ec990 100644 --- a/crates/oxc_js_regex/src/parser.rs +++ b/crates/oxc_js_regex/src/parser.rs @@ -12,9 +12,10 @@ use oxc_syntax::unicode_id_start::is_id_continue; use crate::ast::{ Alternative, Assertion, Backreference, BackreferenceRef, BoundaryAssertion, Branch, - CapturingGroup, Character, EdgeAssertion, EdgeAssertionKind, Element, LookaheadAssertion, - LookaroundAssertion, LookbehindAssertion, Pattern, QuantifiableElement, Quantifier, - RegExpLiteral, StringAlternative, WordBoundaryAssertion, + CapturingGroup, Character, CharacterClass, ClassStringDisjunction, EdgeAssertion, + EdgeAssertionKind, Element, LookaheadAssertion, LookaroundAssertion, LookbehindAssertion, + Pattern, QuantifiableElement, Quantifier, RegExpLiteral, StringAlternative, + WordBoundaryAssertion, }; use crate::ast_builder::AstBuilder; use crate::ecma_version::EcmaVersion; @@ -149,6 +150,15 @@ impl<'a> Parser<'a> { pub fn rewind<'a>(&mut self, start: usize) { self.index = start; } + + fn eat3(&self, first: char, second: char, third: char) -> bool { + if self.is(first) && self.nth(1) == Some(&second) && self.nth(2) == Some(&third) { + self.index += 3; + true + } else { + false + } + } } #[derive(Default, Clone, Copy)] @@ -620,13 +630,11 @@ fn consume_character_class<'a>(parser: &mut Parser<'a>) -> Option(parser: &mut Parser<'a>) -> UnicodeSetsConsumeResult { - if self._unicode_sets_mode { - if self.current_code_point == RIGHT_SQUARE_BRACKET { + if parser._unicode_sets_mode { + if parser.current_code_point == RIGHT_SQUARE_BRACKET { // [empty] // * Static Semantics: MayContainStrings @@ -635,7 +643,7 @@ fn consume_class_contents<'a>(parser: &mut Parser<'a>) -> UnicodeSetsConsumeResu // 1. Return false. return UnicodeSetsConsumeResult { may_contain_strings: None }; } - let result = self.consume_class_set_expression(); + let result = parser.consume_class_set_expression(); // * Static Semantics: MayContainStrings // ClassContents :: ClassSetExpression @@ -643,39 +651,39 @@ fn consume_class_contents<'a>(parser: &mut Parser<'a>) -> UnicodeSetsConsumeResu return result; } - let strict = self.strict || self._unicode_mode; + let strict = parser.strict || parser._unicode_mode; loop { // Consume the first ClassAtom - let range_start = self.index; - if !self.consume_class_atom() { + let range_start = parser.index; + if !parser.consume_class_atom() { break; } - let min = self._last_int_value; + let min = parser._last_int_value; // Consume `-` - if !self.eat(HYPHEN_MINUS) { + if !parser.eat(HYPHEN_MINUS) { continue; } - self.on_character(range_start - 1, self.index, HYPHEN_MINUS); + parser.on_character(range_start - 1, parser.index, HYPHEN_MINUS); // Consume the second ClassAtom - if !self.consume_class_atom() { + if !parser.consume_class_atom() { break; } - let max = self._last_int_value; + let max = parser._last_int_value; // Validate if min == -1 || max == -1 { if strict { - self.raise("Invalid character class"); + parser.raise("Invalid character class"); } continue; } if min > max { - self.raise("Range out of order in character class"); + parser.raise("Range out of order in character class"); } - self.on_character_class_range(range_start, self.index, min, max); + parser.on_character_class_range(range_start, self.index, min, max); } // * Static Semantics: MayContainStrings @@ -701,7 +709,7 @@ fn consume_class_atom<'a>(parser: &mut Parser<'a>) -> bool { } if self.eat(REVERSE_SOLIDUS) { - if self.consume_class_escape() { + if consume_class_escape(parser) { return true; } if !self.strict && self.current_code_point == LATIN_SMALL_LETTER_C { @@ -753,7 +761,7 @@ fn consume_class_escape<'a>(parser: &mut Parser<'a>) -> bool { return true; } - return self.consume_character_class_escape() || self.consume_character_escape(); + return consume_character_class_escape(parser) || consume_character_escape(parser); } /** @@ -890,7 +898,7 @@ fn eat_decimal_digits<'a>(parser: &mut Parser<'a>) -> bool { let Some(d) = ch.to_digit(10) else { break; }; - parser.last_int_value = 10 * parser.last_int_value + d as usize; + parser.last_int_value = 10 * parser.last_int_value + d; parser.advance(); } parser.index != start @@ -944,16 +952,15 @@ fn count_capturing_parens<'a>(parser: &mut Parser<'a>) -> usize { count } -/** - * Consume NestedClass in a character class. - * @returns `UnicodeSetsConsumeResult`. - */ -fn consume_nested_class<'a>(parser: &mut Parser<'a>) -> Option { +/// * Consume NestedClass in a character class. +/// * @returns `UnicodeSetsConsumeResult`. +/// TODO: +fn consume_nested_class<'a>(parser: &mut Parser<'a>) -> Option { let start = self.index; if self.eat(LEFT_SQUARE_BRACKET) { let negate = self.eat(CIRCUMFLEX_ACCENT); self.on_character_class_enter(start, negate, true); - let result = self.consume_class_contents(); + let result = consume_class_contents(parser); if !self.eat(RIGHT_SQUARE_BRACKET) { self.raise("Unterminated character class"); } @@ -988,24 +995,27 @@ fn consume_nested_class<'a>(parser: &mut Parser<'a>) -> Option( parser: &mut Parser<'a>, -) -> Option { - let start = self.index; - if self.eat3(REVERSE_SOLIDUS, LATIN_SMALL_LETTER_Q, LEFT_CURLY_BRACKET) { - self.on_class_string_disjunction_enter(start); - +) -> (Option, Option>) { + let start = parser.index; + if parser.eat3('\\', 'q', '{') { let mut i = 0; let mut may_contain_strings = false; - while self.consume_class_string(i).may_contain_strings.unwrap_or(false) { - may_contain_strings = true; + let mut alternatives = parser.builder.new_vec(); + loop { + let (consume_res, node) = consume_class_string(parser, i); + if consume_res.may_contain_strings.unwrap_or_default() { + may_contain_strings = true; + } + if let Some(node) = node { + alternatives.push(node); + } i += 1; - if !self.eat(VERTICAL_LINE) { + if !parser.eat('|') { break; } } - if self.eat(RIGHT_CURLY_BRACKET) { - self.on_class_string_disjunction_leave(start, self.index); - + if parser.eat('}') { // * Static Semantics: MayContainStrings // ClassStringDisjunction :: \q{ ClassStringDisjunctionContents } // 1. Return MayContainStrings of the ClassStringDisjunctionContents. @@ -1014,9 +1024,12 @@ fn consume_class_string_disjunction<'a>( // ClassStringDisjunctionContents :: ClassString | ClassStringDisjunctionContents // 1. If MayContainStrings of the ClassString is true, return true. // 2. Return MayContainStrings of the ClassStringDisjunctionContents. - return Some(UnicodeSetsConsumeResult { may_contain_strings }); + return ( + Some(UnicodeSetsConsumeResult { may_contain_strings: Some(may_contain_strings) }), + Some(ClassStringDisjunction { span: parser.span_with_start(start), alternatives }), + ); } - self.raise("Unterminated class string disjunction"); + panic!("Unterminated class string disjunction"); } None } @@ -1198,20 +1211,20 @@ fn eat_reg_exp_identifier_start<'a>(parser: &mut Parser<'a>) -> Option<()> { let start = parser.index; let force_u_flag = !parser.context.unicode_mode && parser.context.ecma_version >= EcmaVersion::V2020; - let mut cp = *parser.current()?; + let mut cp = parser.current()?; parser.advance(); if cp == '\\' && eat_reg_exp_unicode_escape_sequence(parser, force_u_flag) { - cp = parser.last_int_value as u32 as char; + cp = char::from_u32(parser.last_int_value).expect("should convert to char"); } else if force_u_flag && is_lead_surrogate(cp) && is_trail_surrogate(parser.current()? as u32) { - cp = combine_surrogate_pair(cp, parser.current() as u32); + cp = combine_surrogate_pair(cp, parser.current().expect("should convert to u32") as u32); parser.advance(); } if is_identifier_start_char(cp) { - parser.last_int_value = cp; - return true; + parser.last_int_value = cp as u32; + return Some(()); } if parser.index != start { @@ -1235,19 +1248,21 @@ fn eat_reg_exp_identifier_part<'a>(parser: &mut Parser<'a>) -> Option<()> { let start = parser.index; let force_u_flag = !parser.context.unicode_mode && parser.context.ecma_version >= EcmaVersion::V2020; - let mut cp = *parser.current()?; + let mut cp = parser.current()?; parser.advance(); if cp == '\\' && eat_reg_exp_unicode_escape_sequence(parser, force_u_flag) { - cp = parser.last_int_value as u32 as char; - } else if force_u_flag && is_lead_surrogate(cp) && is_trail_surrogate(parser.current()? as u32) + cp = char::from_u32(parser.last_int_value).expect("should convert to char"); + } else if force_u_flag + && is_lead_surrogate(cp as u32) + && is_trail_surrogate(parser.current()? as u32) { - cp = combine_surrogate_pair(cp, parser.current()? as u32); + cp = combine_surrogate_pair(cp as u32, parser.current()? as u32); parser.advance(); } if is_identifier_part(cp) { - parser.last_int_value = cp as usize; + parser.last_int_value = cp as u32; return Some(()); } @@ -1363,10 +1378,10 @@ fn eat_reg_exp_unicode_code_point_escape<'a>(parser: &mut Parser<'a>) -> bool { fn eat_decimal_escape<'a>(parser: &mut Parser<'a>) -> Option<()> { parser.last_int_value = 0; let mut cp = parser.current()?; - if cp >= &'1' && cp <= &'9' { - while cp >= &'1' && cp <= &'9' { - parser.last_int_value = 10 * parser.last_int_value - + cp.to_digit(10).expect("should convert successfully") as usize; + if cp >= '1' && cp <= '9' { + while cp >= '1' && cp <= '9' { + parser.last_int_value = + 10 * parser.last_int_value + cp.to_digit(10).expect("should convert successfully"); parser.advance(); cp = match parser.current() { Some(ch) => ch, @@ -1393,7 +1408,7 @@ fn eat_control_letter<'a>(parser: &mut Parser<'a>) -> Option<()> { let cp = parser.current()?; if cp.is_ascii_alphabetic() { parser.advance(); - parser.last_int_value = (cp as usize) % 0x20; + parser.last_int_value = (cp as u32) % 0x20; return Some(()); } None @@ -1445,16 +1460,16 @@ fn eat_reg_exp_unicode_escape_sequence<'a>(parser: &mut Parser<'a>, force_u_flag fn eat_reg_exp_unicode_surrogate_pair_escape<'a>(parser: &mut Parser<'a>) -> bool { let start = parser.index; - if parser.eat_fixed_hex_digits(4) { + if eat_fixed_hex_digits(parser, 4).is_some() { let lead = parser.last_int_value; - if is_lead_surrogate(lead as u32) + if is_lead_surrogate(lead) && parser.eat('\\') && parser.eat('u') - && parser.eat_fixed_hex_digits(4) + && eat_fixed_hex_digits(parser, 4).is_some() { let trail = parser.last_int_value; - if is_trail_surrogate(trail as u32) { - parser.last_int_value = combine_surrogate_pair(lead, trail) as usize; + if is_trail_surrogate(trail) { + parser.last_int_value = combine_surrogate_pair(lead, trail); return true; } } @@ -1483,10 +1498,10 @@ fn eat_reg_exp_unicode_surrogate_pair_escape<'a>(parser: &mut Parser<'a>) -> boo */ fn eat_identity_escape<'a>(parser: &mut Parser<'a>) -> Option<()> { let cp = parser.current(); - if parser.is_valid_identity_escape(cp.cloned()) { - parser.last_int_value = cp.unwrap() as usize; + if is_valid_identity_escape(parser, cp) { + parser.last_int_value = cp.unwrap() as u32; parser.advance(); - return true; + return Some(()); } None } @@ -1548,7 +1563,7 @@ fn eat_octal_digit<'a>(parser: &mut Parser<'a>) -> Option<()> { let cp = parser.current()?; if cp.is_digit(8) { parser.advance(); - parser.last_int_value = cp.to_digit(8)? as usize; + parser.last_int_value = cp.to_digit(8)?; Some(()) } else { parser.last_int_value = 0; @@ -1575,7 +1590,7 @@ fn eat_fixed_hex_digits<'a>(parser: &mut Parser<'a>, length: usize) -> Option<() parser.rewind(start); return None; } - parser.last_int_value = 16 * parser.last_int_value + cp.to_digit(16)? as usize; + parser.last_int_value = 16 * parser.last_int_value + cp.to_digit(16)?; parser.advance(); } Some(())