From a2c35e21ea8866f76c385720830c1bbfc4f747b7 Mon Sep 17 00:00:00 2001 From: Yuji Sugiura Date: Mon, 19 May 2025 16:49:08 +0900 Subject: [PATCH 1/3] refactor(regular_expression): Refactor regexp-modifiers support --- Cargo.lock | 1 + crates/oxc_regular_expression/Cargo.toml | 1 + crates/oxc_regular_expression/README.md | 2 +- crates/oxc_regular_expression/src/ast.rs | 29 +++++--- .../src/ast_impl/allocator.rs | 11 +++ .../src/ast_impl/content_eq.rs | 0 .../src/ast_impl/display.rs | 14 ++-- .../src/ast_impl/mod.rs | 1 + .../src/ast_impl/span.rs | 12 +++- .../src/generated/assert_layouts.rs | 22 +++--- .../src/generated/derive_clone_in.rs | 20 ------ .../src/generated/derive_content_eq.rs | 8 --- .../pattern_parser/pattern_parser_impl.rs | 67 ++++++++++--------- napi/parser/generated/deserialize/js.js | 9 +-- napi/parser/generated/deserialize/ts.js | 9 +-- 15 files changed, 99 insertions(+), 107 deletions(-) create mode 100644 crates/oxc_regular_expression/src/ast_impl/allocator.rs create mode 100644 crates/oxc_regular_expression/src/ast_impl/content_eq.rs diff --git a/Cargo.lock b/Cargo.lock index 02e9adc29ccc7..d40efbea11ec4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2018,6 +2018,7 @@ dependencies = [ name = "oxc_regular_expression" version = "0.70.0" dependencies = [ + "bitflags 2.9.1", "oxc_allocator", "oxc_ast_macros", "oxc_diagnostics", diff --git a/crates/oxc_regular_expression/Cargo.toml b/crates/oxc_regular_expression/Cargo.toml index 6f0c33d72b28a..d8ed69f2639d0 100644 --- a/crates/oxc_regular_expression/Cargo.toml +++ b/crates/oxc_regular_expression/Cargo.toml @@ -25,6 +25,7 @@ oxc_ast_macros = { workspace = true } oxc_diagnostics = { workspace = true } oxc_span = { workspace = true } +bitflags = { workspace = true } phf = { workspace = true, features = ["macros"] } rustc-hash = { workspace = true } unicode-id-start = { workspace = true } diff --git a/crates/oxc_regular_expression/README.md b/crates/oxc_regular_expression/README.md index facadc40e9271..d154516dbd7f5 100644 --- a/crates/oxc_regular_expression/README.md +++ b/crates/oxc_regular_expression/README.md @@ -6,7 +6,7 @@ Implements ECMAScript® 2024 Language Specification - https://tc39.es/ecma262/2024/multipage/text-processing.html#sec-regexp-regular-expression-objects - https://tc39.es/ecma262/2024/multipage/additional-ecmascript-features-for-web-browsers.html#sec-regular-expressions-patterns -And, Stage 3 proposals +And, Stage 4 proposals - https://github.com/tc39/proposal-duplicate-named-capturing-groups - https://github.com/tc39/proposal-regexp-modifiers diff --git a/crates/oxc_regular_expression/src/ast.rs b/crates/oxc_regular_expression/src/ast.rs index fb38e7e7e10a2..af0a51e199618 100644 --- a/crates/oxc_regular_expression/src/ast.rs +++ b/crates/oxc_regular_expression/src/ast.rs @@ -1,3 +1,5 @@ +use bitflags::bitflags; + use oxc_allocator::{Box, CloneIn, GetAddress, Vec}; use oxc_ast_macros::ast; use oxc_span::{Atom, ContentEq, Span}; @@ -283,19 +285,26 @@ pub struct IgnoreGroup<'a> { #[generate_derive(CloneIn, ContentEq)] pub struct Modifiers { pub span: Span, - pub enabling: Option, - pub disabling: Option, + pub enabling: Modifier, + pub disabling: Modifier, } -/// Each part of modifier in [`Modifiers`]. -#[ast] -#[derive(Debug)] -#[generate_derive(CloneIn, ContentEq)] -pub struct Modifier { - pub ignore_case: bool, - pub multiline: bool, - pub sticky: bool, +bitflags! { + /// Each part of modifier in [`Modifiers`]. + #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] + pub struct Modifier: u8 { + /// Ignore case flag + const I = 1 << 0; + /// Multiline flag + const M = 1 << 1; + /// DotAll flag + const S = 1 << 2; + } } +/// Dummy type to communicate the content of `Modifier` to `oxc_ast_tools`. +#[ast(foreign = Modifier)] +#[expect(dead_code)] +struct ModifierAlias(u8); /// Backreference by index. /// e.g. `\1`, `\2`, `\3` diff --git a/crates/oxc_regular_expression/src/ast_impl/allocator.rs b/crates/oxc_regular_expression/src/ast_impl/allocator.rs new file mode 100644 index 0000000000000..592a7194f740d --- /dev/null +++ b/crates/oxc_regular_expression/src/ast_impl/allocator.rs @@ -0,0 +1,11 @@ +use oxc_allocator::{Allocator, CloneIn}; + +use crate::ast::Modifier; + +impl<'alloc> CloneIn<'alloc> for Modifier { + type Cloned = Self; + + fn clone_in(&self, _: &'alloc Allocator) -> Self::Cloned { + *self + } +} diff --git a/crates/oxc_regular_expression/src/ast_impl/content_eq.rs b/crates/oxc_regular_expression/src/ast_impl/content_eq.rs new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/crates/oxc_regular_expression/src/ast_impl/display.rs b/crates/oxc_regular_expression/src/ast_impl/display.rs index ee0fb3d476dcc..a2c56973c6ea3 100644 --- a/crates/oxc_regular_expression/src/ast_impl/display.rs +++ b/crates/oxc_regular_expression/src/ast_impl/display.rs @@ -251,13 +251,13 @@ impl Display for CapturingGroup<'_> { impl Display for IgnoreGroup<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn write_flags(f: &mut fmt::Formatter<'_>, flags: &Modifier) -> fmt::Result { - if flags.ignore_case { + if flags.contains(Modifier::I) { f.write_str("i")?; } - if flags.multiline { + if flags.contains(Modifier::M) { f.write_str("m")?; } - if flags.sticky { + if flags.contains(Modifier::S) { f.write_str("s")?; } Ok(()) @@ -266,12 +266,12 @@ impl Display for IgnoreGroup<'_> { f.write_str("(?")?; if let Some(modifiers) = &self.modifiers { - if let Some(enabling) = &modifiers.enabling { - write_flags(f, enabling)?; + if !modifiers.enabling.is_empty() { + write_flags(f, &modifiers.enabling)?; } - if let Some(disabling) = &modifiers.disabling { + if !modifiers.disabling.is_empty() { f.write_str("-")?; - write_flags(f, disabling)?; + write_flags(f, &modifiers.disabling)?; } } diff --git a/crates/oxc_regular_expression/src/ast_impl/mod.rs b/crates/oxc_regular_expression/src/ast_impl/mod.rs index e3cc281a57648..b95253a8f4977 100644 --- a/crates/oxc_regular_expression/src/ast_impl/mod.rs +++ b/crates/oxc_regular_expression/src/ast_impl/mod.rs @@ -1,3 +1,4 @@ +mod allocator; mod display; mod span; pub mod visit; diff --git a/crates/oxc_regular_expression/src/ast_impl/span.rs b/crates/oxc_regular_expression/src/ast_impl/span.rs index 07aa2a83b07d7..b477e4a9409a9 100644 --- a/crates/oxc_regular_expression/src/ast_impl/span.rs +++ b/crates/oxc_regular_expression/src/ast_impl/span.rs @@ -1,9 +1,9 @@ // NOTE: For now, this file is implemented by hand for convenience. // But like `oxc_ast`, this should be generated by `tasks/ast_tools` in the future. -use oxc_span::{GetSpan, Span}; +use oxc_span::{ContentEq, GetSpan, Span}; -use crate::ast::{CharacterClassContents, Term}; +use crate::ast::{CharacterClassContents, Modifier, Term}; impl GetSpan for Term<'_> { #[inline] @@ -38,3 +38,11 @@ impl GetSpan for CharacterClassContents<'_> { } } } + +// --- + +impl ContentEq for Modifier { + fn content_eq(&self, other: &Self) -> bool { + self == other + } +} diff --git a/crates/oxc_regular_expression/src/generated/assert_layouts.rs b/crates/oxc_regular_expression/src/generated/assert_layouts.rs index ef5de10434a2b..e94f786f73416 100644 --- a/crates/oxc_regular_expression/src/generated/assert_layouts.rs +++ b/crates/oxc_regular_expression/src/generated/assert_layouts.rs @@ -135,25 +135,22 @@ const _: () = { assert!(offset_of!(CapturingGroup, body) == 24); // Padding: 0 bytes - assert!(size_of::() == 56); + assert!(size_of::() == 64); assert!(align_of::() == 8); assert!(offset_of!(IgnoreGroup, span) == 0); assert!(offset_of!(IgnoreGroup, modifiers) == 8); - assert!(offset_of!(IgnoreGroup, body) == 24); + assert!(offset_of!(IgnoreGroup, body) == 32); - // Padding: 2 bytes + // Padding: 6 bytes assert!(size_of::() == 16); assert!(align_of::() == 8); assert!(offset_of!(Modifiers, span) == 0); assert!(offset_of!(Modifiers, enabling) == 8); - assert!(offset_of!(Modifiers, disabling) == 11); + assert!(offset_of!(Modifiers, disabling) == 9); // Padding: 0 bytes - assert!(size_of::() == 3); + assert!(size_of::() == 1); assert!(align_of::() == 1); - assert!(offset_of!(Modifier, ignore_case) == 0); - assert!(offset_of!(Modifier, multiline) == 1); - assert!(offset_of!(Modifier, sticky) == 2); // Padding: 4 bytes assert!(size_of::() == 16); @@ -303,18 +300,15 @@ const _: () = { assert!(offset_of!(IgnoreGroup, body) == 24); // Padding: 2 bytes - assert!(size_of::() == 16); + assert!(size_of::() == 12); assert!(align_of::() == 4); assert!(offset_of!(Modifiers, span) == 0); assert!(offset_of!(Modifiers, enabling) == 8); - assert!(offset_of!(Modifiers, disabling) == 11); + assert!(offset_of!(Modifiers, disabling) == 9); // Padding: 0 bytes - assert!(size_of::() == 3); + assert!(size_of::() == 1); assert!(align_of::() == 1); - assert!(offset_of!(Modifier, ignore_case) == 0); - assert!(offset_of!(Modifier, multiline) == 1); - assert!(offset_of!(Modifier, sticky) == 2); // Padding: 0 bytes assert!(size_of::() == 12); diff --git a/crates/oxc_regular_expression/src/generated/derive_clone_in.rs b/crates/oxc_regular_expression/src/generated/derive_clone_in.rs index 54b00dc744ddf..8093dc715a34a 100644 --- a/crates/oxc_regular_expression/src/generated/derive_clone_in.rs +++ b/crates/oxc_regular_expression/src/generated/derive_clone_in.rs @@ -529,26 +529,6 @@ impl<'new_alloc> CloneIn<'new_alloc> for Modifiers { } } -impl<'new_alloc> CloneIn<'new_alloc> for Modifier { - type Cloned = Modifier; - - fn clone_in(&self, allocator: &'new_alloc Allocator) -> Self::Cloned { - Modifier { - ignore_case: CloneIn::clone_in(&self.ignore_case, allocator), - multiline: CloneIn::clone_in(&self.multiline, allocator), - sticky: CloneIn::clone_in(&self.sticky, allocator), - } - } - - fn clone_in_with_semantic_ids(&self, allocator: &'new_alloc Allocator) -> Self::Cloned { - Modifier { - ignore_case: CloneIn::clone_in_with_semantic_ids(&self.ignore_case, allocator), - multiline: CloneIn::clone_in_with_semantic_ids(&self.multiline, allocator), - sticky: CloneIn::clone_in_with_semantic_ids(&self.sticky, allocator), - } - } -} - impl<'new_alloc> CloneIn<'new_alloc> for IndexedReference { type Cloned = IndexedReference; diff --git a/crates/oxc_regular_expression/src/generated/derive_content_eq.rs b/crates/oxc_regular_expression/src/generated/derive_content_eq.rs index 9f4b5bc340065..918c5e3ddb9fa 100644 --- a/crates/oxc_regular_expression/src/generated/derive_content_eq.rs +++ b/crates/oxc_regular_expression/src/generated/derive_content_eq.rs @@ -189,14 +189,6 @@ impl ContentEq for Modifiers { } } -impl ContentEq for Modifier { - fn content_eq(&self, other: &Self) -> bool { - ContentEq::content_eq(&self.ignore_case, &other.ignore_case) - && ContentEq::content_eq(&self.multiline, &other.multiline) - && ContentEq::content_eq(&self.sticky, &other.sticky) - } -} - impl ContentEq for IndexedReference { fn content_eq(&self, other: &Self) -> bool { ContentEq::content_eq(&self.index, &other.index) diff --git a/crates/oxc_regular_expression/src/parser/pattern_parser/pattern_parser_impl.rs b/crates/oxc_regular_expression/src/parser/pattern_parser/pattern_parser_impl.rs index 2e54b4e92574f..f210d711a2873 100644 --- a/crates/oxc_regular_expression/src/parser/pattern_parser/pattern_parser_impl.rs +++ b/crates/oxc_regular_expression/src/parser/pattern_parser/pattern_parser_impl.rs @@ -1578,22 +1578,31 @@ impl<'a> PatternParser<'a> { fn parse_modifiers(&mut self) -> Result> { let span_start = self.reader.offset(); - // Currently only `[i, m, s]` are supported - let mut enabling_flags = [0, 0, 0]; - let mut disabling_flags = [0, 0, 0]; + let mut enabling = ast::Modifier::empty(); + let mut disabling = ast::Modifier::empty(); + let mut duplicate = false; // Enabling while self.reader.peek().filter(|&cp| cp == ':' as u32 || cp == '-' as u32).is_none() { if self.reader.eat('i') { - enabling_flags[0] += 1; + if enabling.contains(ast::Modifier::I) { + duplicate = true; + } + enabling |= ast::Modifier::I; continue; } if self.reader.eat('m') { - enabling_flags[1] += 1; + if enabling.contains(ast::Modifier::M) { + duplicate = true; + } + enabling |= ast::Modifier::M; continue; } if self.reader.eat('s') { - enabling_flags[2] += 1; + if enabling.contains(ast::Modifier::S) { + duplicate = true; + } + enabling |= ast::Modifier::S; continue; } @@ -1606,15 +1615,24 @@ impl<'a> PatternParser<'a> { if self.reader.eat('-') { while self.reader.peek().filter(|&cp| cp == ':' as u32).is_none() { if self.reader.eat('i') { - disabling_flags[0] += 1; + if disabling.contains(ast::Modifier::I) { + duplicate = true; + } + disabling |= ast::Modifier::I; continue; } if self.reader.eat('m') { - disabling_flags[1] += 1; + if disabling.contains(ast::Modifier::M) { + duplicate = true; + } + disabling |= ast::Modifier::M; continue; } if self.reader.eat('s') { - disabling_flags[2] += 1; + if disabling.contains(ast::Modifier::S) { + duplicate = true; + } + disabling |= ast::Modifier::S; continue; } @@ -1624,23 +1642,18 @@ impl<'a> PatternParser<'a> { } } - let (enabling_iter, disabling_iter) = (enabling_flags.iter(), disabling_flags.iter()); - // [SS:EE] Atom :: (? RegularExpressionModifiers : Disjunction ) // It is a Syntax Error if the source text matched by RegularExpressionModifiers contains the same code point more than once. // [SS:EE] Atom :: (? RegularExpressionModifiers - RegularExpressionModifiers : Disjunction ) + // It is a Syntax Error if the source text matched by the first RegularExpressionModifiers and the source text matched by the second RegularExpressionModifiers are both empty. // It is a Syntax Error if the source text matched by the first RegularExpressionModifiers contains the same code point more than once. - // It is a Syntax Error if the source text matched by the second RegularExpressionModifiers contains the same code point more than once. // It is a Syntax Error if any code point in the source text matched by the first RegularExpressionModifiers is also contained in the source text matched by the second RegularExpressionModifiers. - let flags_iter = enabling_iter.clone().zip(disabling_iter.clone()); - if flags_iter.clone().any(|flags| !matches!(flags, (0 | 1, 0) | (0, 1))) { - return Err(diagnostics::invalid_modifiers( - self.span_factory.create(span_start, self.reader.offset()), - )); - } - // NOTE: Spec is not yet fixed and merged, so these may change: - // https://github.com/tc39/ecma262/pull/3221#pullrequestreview-2341169958 - if flags_iter.clone().all(|flags| matches!(flags, (0, 0))) { + if enabling.is_empty() && disabling.is_empty() + || duplicate + || [ast::Modifier::I, ast::Modifier::M, ast::Modifier::S] + .iter() + .any(|&modifier| enabling.contains(modifier) && disabling.contains(modifier)) + { return Err(diagnostics::invalid_modifiers( self.span_factory.create(span_start, self.reader.offset()), )); @@ -1648,16 +1661,8 @@ impl<'a> PatternParser<'a> { Ok(Some(ast::Modifiers { span: self.span_factory.create(span_start, self.reader.offset()), - enabling: enabling_iter.clone().any(|f| *f == 1).then(|| ast::Modifier { - ignore_case: enabling_flags[0] == 1, - multiline: enabling_flags[1] == 1, - sticky: enabling_flags[2] == 1, - }), - disabling: disabling_iter.clone().any(|f| *f == 1).then(|| ast::Modifier { - ignore_case: disabling_flags[0] == 1, - multiline: disabling_flags[1] == 1, - sticky: disabling_flags[2] == 1, - }), + enabling, + disabling, })) } diff --git a/napi/parser/generated/deserialize/js.js b/napi/parser/generated/deserialize/js.js index 0d594d89b8b38..a48232d9153cf 100644 --- a/napi/parser/generated/deserialize/js.js +++ b/napi/parser/generated/deserialize/js.js @@ -5389,13 +5389,8 @@ function deserializeVecCharacter(pos) { } function deserializeOptionModifiers(pos) { - if (uint8[pos + 8] === 3) return null; - return deserializeModifiers(pos); -} - -function deserializeOptionModifier(pos) { - if (uint8[pos] === 2) return null; - return deserializeModifier(pos); + if (uint8[pos] === 0) return null; + return deserializeModifiers(pos + 8); } function deserializeVecError(pos) { diff --git a/napi/parser/generated/deserialize/ts.js b/napi/parser/generated/deserialize/ts.js index 7547afab6f25c..d88d9ba54cdce 100644 --- a/napi/parser/generated/deserialize/ts.js +++ b/napi/parser/generated/deserialize/ts.js @@ -5541,13 +5541,8 @@ function deserializeVecCharacter(pos) { } function deserializeOptionModifiers(pos) { - if (uint8[pos + 8] === 3) return null; - return deserializeModifiers(pos); -} - -function deserializeOptionModifier(pos) { - if (uint8[pos] === 2) return null; - return deserializeModifier(pos); + if (uint8[pos] === 0) return null; + return deserializeModifiers(pos + 8); } function deserializeVecError(pos) { From 0a06b8ce40728fe14d57085b22e25c2d13a28336 Mon Sep 17 00:00:00 2001 From: Yuji Sugiura Date: Mon, 19 May 2025 16:57:14 +0900 Subject: [PATCH 2/3] Fix lint --- crates/oxc_regular_expression/src/ast_impl/display.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/oxc_regular_expression/src/ast_impl/display.rs b/crates/oxc_regular_expression/src/ast_impl/display.rs index a2c56973c6ea3..c4c48a84f0d06 100644 --- a/crates/oxc_regular_expression/src/ast_impl/display.rs +++ b/crates/oxc_regular_expression/src/ast_impl/display.rs @@ -250,7 +250,7 @@ impl Display for CapturingGroup<'_> { impl Display for IgnoreGroup<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fn write_flags(f: &mut fmt::Formatter<'_>, flags: &Modifier) -> fmt::Result { + fn write_flags(f: &mut fmt::Formatter<'_>, flags: Modifier) -> fmt::Result { if flags.contains(Modifier::I) { f.write_str("i")?; } @@ -267,11 +267,11 @@ impl Display for IgnoreGroup<'_> { if let Some(modifiers) = &self.modifiers { if !modifiers.enabling.is_empty() { - write_flags(f, &modifiers.enabling)?; + write_flags(f, modifiers.enabling)?; } if !modifiers.disabling.is_empty() { f.write_str("-")?; - write_flags(f, &modifiers.disabling)?; + write_flags(f, modifiers.disabling)?; } } From 21bc19dc05b6c64049ee0dc5665d8ba15f21f286 Mon Sep 17 00:00:00 2001 From: Yuji Sugiura Date: Mon, 19 May 2025 17:18:54 +0900 Subject: [PATCH 3/3] Remove useless file --- crates/oxc_regular_expression/src/ast_impl/content_eq.rs | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 crates/oxc_regular_expression/src/ast_impl/content_eq.rs diff --git a/crates/oxc_regular_expression/src/ast_impl/content_eq.rs b/crates/oxc_regular_expression/src/ast_impl/content_eq.rs deleted file mode 100644 index e69de29bb2d1d..0000000000000