diff --git a/Cargo.lock b/Cargo.lock index 3b6b01085ffe5..b19878dd83d65 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1441,7 +1441,6 @@ dependencies = [ name = "oxc_ast" version = "0.27.0" dependencies = [ - "bitflags 2.6.0", "num-bigint", "oxc_allocator", "oxc_ast_macros", @@ -1779,6 +1778,7 @@ dependencies = [ "oxc_allocator", "oxc_ast", "oxc_parser", + "oxc_regular_expression", "oxc_span", "oxc_syntax", "pico-args", @@ -1802,6 +1802,7 @@ dependencies = [ name = "oxc_regular_expression" version = "0.27.0" dependencies = [ + "bitflags 2.6.0", "oxc_allocator", "oxc_ast_macros", "oxc_diagnostics", @@ -1846,6 +1847,7 @@ dependencies = [ "oxc_diagnostics", "oxc_index", "oxc_parser", + "oxc_regular_expression", "oxc_span", "oxc_syntax", "phf 0.11.2", diff --git a/crates/oxc_ast/Cargo.toml b/crates/oxc_ast/Cargo.toml index f0318135f5668..86840b880b8dc 100644 --- a/crates/oxc_ast/Cargo.toml +++ b/crates/oxc_ast/Cargo.toml @@ -24,9 +24,7 @@ oxc_ast_macros = { workspace = true } oxc_span = { workspace = true } oxc_syntax = { workspace = true } oxc_regular_expression = { workspace = true } - -bitflags = { workspace = true } -num-bigint = { workspace = true } +num-bigint = { workspace = true } serde = { workspace = true, features = ["derive"], optional = true } serde_json = { workspace = true, optional = true } diff --git a/crates/oxc_ast/src/ast/literal.rs b/crates/oxc_ast/src/ast/literal.rs index c4fe6200de64f..4696b81bf2b4f 100644 --- a/crates/oxc_ast/src/ast/literal.rs +++ b/crates/oxc_ast/src/ast/literal.rs @@ -7,12 +7,9 @@ // Silence erroneous warnings from Rust Analyser for `#[derive(Tsify)]` #![allow(non_snake_case)] -use std::hash::Hash; - -use bitflags::bitflags; use oxc_allocator::{Box, CloneIn}; use oxc_ast_macros::ast; -use oxc_regular_expression::ast::Pattern; +use oxc_regular_expression::ast::{Pattern, RegularExpressionFlags}; use oxc_span::{cmp::ContentEq, hash::ContentHash, Atom, GetSpan, GetSpanMut, Span}; use oxc_syntax::number::{BigintBase, NumberBase}; 
#[cfg(feature = "serialize")] @@ -111,7 +108,7 @@ pub struct RegExp<'a> { /// The regex pattern between the slashes pub pattern: RegExpPattern<'a>, /// Regex flags after the closing slash - pub flags: RegExpFlags, + pub flags: RegularExpressionFlags, } /// A regular expression pattern @@ -152,68 +149,3 @@ pub struct StringLiteral<'a> { pub span: Span, pub value: Atom<'a>, } - -bitflags! { - /// Regular expression flags. - /// - /// - #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] - pub struct RegExpFlags: u8 { - /// Global flag - /// - /// Causes the pattern to match multiple times. - const G = 1 << 0; - /// Ignore case flag - /// - /// Causes the pattern to ignore case. - const I = 1 << 1; - /// Multiline flag - /// - /// Causes `^` and `$` to match the start/end of each line. - const M = 1 << 2; - /// DotAll flag - /// - /// Causes `.` to also match newlines. - const S = 1 << 3; - /// Unicode flag - /// - /// Causes the pattern to treat the input as a sequence of Unicode code points. - const U = 1 << 4; - /// Sticky flag - /// - /// Perform a "sticky" search that matches starting at the current position in the target string. - const Y = 1 << 5; - /// Indices flag - /// - /// Causes the regular expression to generate indices for substring matches. - const D = 1 << 6; - /// Unicode sets flag - /// - /// Similar to the `u` flag, but also enables the `\\p{}` and `\\P{}` syntax. - /// Added by the [`v` flag proposal](https://github.com/tc39/proposal-regexp-set-notation). 
- const V = 1 << 7; - } -} - -#[cfg(feature = "serialize")] -#[wasm_bindgen::prelude::wasm_bindgen(typescript_custom_section)] -const TS_APPEND_CONTENT: &'static str = r#" -export type RegExpFlags = { - /** Global flag */ - G: 1, - /** Ignore case flag */ - I: 2, - /** Multiline flag */ - M: 4, - /** DotAll flag */ - S: 8, - /** Unicode flag */ - U: 16, - /** Sticky flag */ - Y: 32, - /** Indices flag */ - D: 64, - /** Unicode sets flag */ - V: 128 -}; -"#; diff --git a/crates/oxc_ast/src/ast_impl/literal.rs b/crates/oxc_ast/src/ast_impl/literal.rs index 982fecdaed1d6..6e19cf490bb70 100644 --- a/crates/oxc_ast/src/ast_impl/literal.rs +++ b/crates/oxc_ast/src/ast_impl/literal.rs @@ -9,9 +9,8 @@ use std::{ hash::{Hash, Hasher}, }; -use oxc_allocator::CloneIn; use oxc_regular_expression::ast::Pattern; -use oxc_span::{cmp::ContentEq, hash::ContentHash, Atom, Span}; +use oxc_span::{hash::ContentHash, Atom, Span}; use oxc_syntax::number::NumberBase; use crate::ast::*; @@ -173,92 +172,6 @@ impl<'a> fmt::Display for RegExpPattern<'a> { } } -impl ContentEq for RegExpFlags { - fn content_eq(&self, other: &Self) -> bool { - self == other - } -} - -impl ContentHash for RegExpFlags { - fn content_hash(&self, state: &mut H) { - Hash::hash(self, state); - } -} - -impl<'alloc> CloneIn<'alloc> for RegExpFlags { - type Cloned = Self; - - fn clone_in(&self, _: &'alloc oxc_allocator::Allocator) -> Self::Cloned { - *self - } -} - -impl TryFrom for RegExpFlags { - type Error = char; - - fn try_from(value: char) -> Result { - match value { - 'g' => Ok(Self::G), - 'i' => Ok(Self::I), - 'm' => Ok(Self::M), - 's' => Ok(Self::S), - 'u' => Ok(Self::U), - 'y' => Ok(Self::Y), - 'd' => Ok(Self::D), - 'v' => Ok(Self::V), - _ => Err(value), - } - } -} - -impl TryFrom for RegExpFlags { - type Error = u8; - - fn try_from(value: u8) -> Result { - match value { - b'g' => Ok(Self::G), - b'i' => Ok(Self::I), - b'm' => Ok(Self::M), - b's' => Ok(Self::S), - b'u' => Ok(Self::U), - b'y' => Ok(Self::Y), - 
b'd' => Ok(Self::D), - b'v' => Ok(Self::V), - _ => Err(value), - } - } -} - -impl fmt::Display for RegExpFlags { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - if self.contains(Self::G) { - write!(f, "g")?; - } - if self.contains(Self::I) { - write!(f, "i")?; - } - if self.contains(Self::M) { - write!(f, "m")?; - } - if self.contains(Self::S) { - write!(f, "s")?; - } - if self.contains(Self::U) { - write!(f, "u")?; - } - if self.contains(Self::Y) { - write!(f, "y")?; - } - if self.contains(Self::D) { - write!(f, "d")?; - } - if self.contains(Self::V) { - write!(f, "v")?; - } - Ok(()) - } -} - impl<'a> StringLiteral<'a> { pub fn new(span: Span, value: Atom<'a>) -> Self { Self { span, value } diff --git a/crates/oxc_ast/src/generated/assert_layouts.rs b/crates/oxc_ast/src/generated/assert_layouts.rs index 344a76cfbb5b4..c0dbd75ae1469 100644 --- a/crates/oxc_ast/src/generated/assert_layouts.rs +++ b/crates/oxc_ast/src/generated/assert_layouts.rs @@ -1412,24 +1412,12 @@ const _: () = { assert!(size_of::() == 1usize); assert!(align_of::() == 1usize); - assert!(size_of::() == 72usize); + assert!(size_of::() == 64usize); assert!(align_of::() == 8usize); assert!(offset_of!(RegularExpression, span) == 0usize); assert!(offset_of!(RegularExpression, pattern) == 8usize); assert!(offset_of!(RegularExpression, flags) == 56usize); - assert!(size_of::() == 16usize); - assert!(align_of::() == 4usize); - assert!(offset_of!(Flags, span) == 0usize); - assert!(offset_of!(Flags, global) == 8usize); - assert!(offset_of!(Flags, ignore_case) == 9usize); - assert!(offset_of!(Flags, multiline) == 10usize); - assert!(offset_of!(Flags, unicode) == 11usize); - assert!(offset_of!(Flags, sticky) == 12usize); - assert!(offset_of!(Flags, dot_all) == 13usize); - assert!(offset_of!(Flags, has_indices) == 14usize); - assert!(offset_of!(Flags, unicode_sets) == 15usize); - assert!(size_of::() == 48usize); assert!(align_of::() == 8usize); assert!(offset_of!(Pattern, span) == 0usize); @@ 
-2966,24 +2954,12 @@ const _: () = { assert!(size_of::() == 1usize); assert!(align_of::() == 1usize); - assert!(size_of::() == 56usize); + assert!(size_of::() == 44usize); assert!(align_of::() == 4usize); assert!(offset_of!(RegularExpression, span) == 0usize); assert!(offset_of!(RegularExpression, pattern) == 8usize); assert!(offset_of!(RegularExpression, flags) == 40usize); - assert!(size_of::() == 16usize); - assert!(align_of::() == 4usize); - assert!(offset_of!(Flags, span) == 0usize); - assert!(offset_of!(Flags, global) == 8usize); - assert!(offset_of!(Flags, ignore_case) == 9usize); - assert!(offset_of!(Flags, multiline) == 10usize); - assert!(offset_of!(Flags, unicode) == 11usize); - assert!(offset_of!(Flags, sticky) == 12usize); - assert!(offset_of!(Flags, dot_all) == 13usize); - assert!(offset_of!(Flags, has_indices) == 14usize); - assert!(offset_of!(Flags, unicode_sets) == 15usize); - assert!(size_of::() == 32usize); assert!(align_of::() == 4usize); assert!(offset_of!(Pattern, span) == 0usize); diff --git a/crates/oxc_ast/src/serialize.rs b/crates/oxc_ast/src/serialize.rs index 934deaa417966..2d200811691fb 100644 --- a/crates/oxc_ast/src/serialize.rs +++ b/crates/oxc_ast/src/serialize.rs @@ -9,8 +9,8 @@ use crate::ast::{ ArrayAssignmentTarget, ArrayPattern, AssignmentTargetMaybeDefault, AssignmentTargetProperty, AssignmentTargetRest, BindingPattern, BindingPatternKind, BindingProperty, BindingRestElement, Directive, Elision, FormalParameter, FormalParameterKind, FormalParameters, - ObjectAssignmentTarget, ObjectPattern, Program, RegExpFlags, Statement, StringLiteral, - TSModuleBlock, TSTypeAnnotation, + ObjectAssignmentTarget, ObjectPattern, Program, Statement, StringLiteral, TSModuleBlock, + TSTypeAnnotation, }; pub struct EcmaFormatter; @@ -42,15 +42,6 @@ impl<'a> Program<'a> { } } -impl Serialize for RegExpFlags { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - serializer.serialize_str(&self.to_string()) - } -} - /// 
Serialize `ArrayExpressionElement::Elision` variant as `null` in JSON impl Serialize for Elision { fn serialize(&self, serializer: S) -> Result diff --git a/crates/oxc_linter/src/ast_util.rs b/crates/oxc_linter/src/ast_util.rs index 127fcc26d4596..6e26a638a97c0 100644 --- a/crates/oxc_linter/src/ast_util.rs +++ b/crates/oxc_linter/src/ast_util.rs @@ -1,6 +1,7 @@ use core::hash::Hasher; use oxc_ast::{ast::BindingIdentifier, AstKind}; +use oxc_regular_expression::ast::RegularExpressionFlags; use oxc_semantic::{AstNode, AstNodeId, SymbolId}; use oxc_span::{hash::ContentHash, GetSpan, Span}; use oxc_syntax::operator::{AssignmentOperator, BinaryOperator, LogicalOperator, UnaryOperator}; @@ -290,16 +291,16 @@ pub fn get_symbol_id_of_variable( pub fn extract_regex_flags<'a>( args: &'a oxc_allocator::Vec<'a, Argument<'a>>, -) -> Option { +) -> Option { if args.len() <= 1 { return None; } let Argument::StringLiteral(flag_arg) = &args[1] else { return None; }; - let mut flags = RegExpFlags::empty(); + let mut flags = RegularExpressionFlags::empty(); for ch in flag_arg.value.chars() { - let flag = RegExpFlags::try_from(ch).ok()?; + let flag = RegularExpressionFlags::try_from(ch).ok()?; flags |= flag; } Some(flags) diff --git a/crates/oxc_linter/src/rules/eslint/no_control_regex.rs b/crates/oxc_linter/src/rules/eslint/no_control_regex.rs index 2454e2a1dabd7..cd9e94f36ecab 100644 --- a/crates/oxc_linter/src/rules/eslint/no_control_regex.rs +++ b/crates/oxc_linter/src/rules/eslint/no_control_regex.rs @@ -1,10 +1,11 @@ use lazy_static::lazy_static; use oxc_ast::{ - ast::{Argument, RegExpFlags, RegExpPattern}, + ast::{Argument, RegExpPattern}, AstKind, }; use oxc_diagnostics::OxcDiagnostic; use oxc_macros::declare_oxc_lint; +use oxc_regular_expression::ast::RegularExpressionFlags; use oxc_span::{GetSpan, Span}; use regex::{Matches, Regex}; @@ -101,7 +102,7 @@ impl Rule for NoControlRegex { // extract numeric part from \u{00} if numeric_part.starts_with('{') { let has_unicode_flag 
= match flags { - Some(flags) if flags.contains(RegExpFlags::U) => true, + Some(flags) if flags.contains(RegularExpressionFlags::U) => true, _ => { continue; } @@ -161,7 +162,7 @@ struct RegexPatternData<'a> { /// /// Note that flags are represented by a `u8` and therefore safely clonable /// with low performance overhead. - flags: Option, + flags: Option, /// The pattern's span. For [`oxc_ast::ast::Expression::NewExpression`]s /// and [`oxc_ast::ast::Expression::CallExpression`]s, /// this will match the entire new/call expression. @@ -176,7 +177,7 @@ struct RegexPatternData<'a> { /// * /foo/ -> "foo" /// * new RegExp("foo") -> foo /// -/// note: [`RegExpFlags`] and [`Span`]s are both tiny and cloneable. +/// note: [`RegularExpressionFlags`] and [`Span`]s are both tiny and cloneable. fn regex_pattern<'a>(node: &AstNode<'a>) -> Option> { let kind = node.kind(); match kind { diff --git a/crates/oxc_linter/src/rules/eslint/no_invalid_regexp.rs b/crates/oxc_linter/src/rules/eslint/no_invalid_regexp.rs index 762fa3d529ac3..c53a3f4c5c1db 100644 --- a/crates/oxc_linter/src/rules/eslint/no_invalid_regexp.rs +++ b/crates/oxc_linter/src/rules/eslint/no_invalid_regexp.rs @@ -2,7 +2,7 @@ use oxc_allocator::Allocator; use oxc_ast::{ast::Argument, AstKind}; use oxc_diagnostics::{LabeledSpan, OxcDiagnostic}; use oxc_macros::declare_oxc_lint; -use oxc_regular_expression::{FlagsParser, ParserOptions, PatternParser}; +use oxc_regular_expression::{FlagsParser, ParserOptions, PatternParser, PatternParserOptions}; use oxc_span::Span; use rustc_hash::FxHashSet; use serde::Deserialize; @@ -131,15 +131,10 @@ impl Rule for NoInvalidRegexp { // Pattern check is skipped when 1st argument is NOT a `StringLiteral` // e.g. 
`new RegExp(var)`, `RegExp("str" + var)` if let Some((pattern_span_start, pattern_text)) = pattern_arg { - let mut options = ParserOptions::default().with_span_offset(pattern_span_start); - if let Some(flags) = parsed_flags { - if flags.unicode || flags.unicode_sets { - options = options.with_unicode_mode(); - } - if flags.unicode_sets { - options = options.with_unicode_sets_mode(); - } - } + let options = parsed_flags.map_or_else( + || PatternParserOptions::default().with_span_offset(pattern_span_start), + |flags| PatternParserOptions { span_offset: pattern_span_start, flags }, + ); match PatternParser::new(&allocator, pattern_text, options).parse() { Ok(_) => {} Err(diagnostic) => ctx.diagnostic(diagnostic), diff --git a/crates/oxc_linter/src/rules/oxc/bad_replace_all_arg.rs b/crates/oxc_linter/src/rules/oxc/bad_replace_all_arg.rs index 3445c1e024687..441674aa0fc15 100644 --- a/crates/oxc_linter/src/rules/oxc/bad_replace_all_arg.rs +++ b/crates/oxc_linter/src/rules/oxc/bad_replace_all_arg.rs @@ -1,9 +1,7 @@ -use oxc_ast::{ - ast::{Expression, RegExpFlags}, - AstKind, -}; +use oxc_ast::{ast::Expression, AstKind}; use oxc_diagnostics::OxcDiagnostic; use oxc_macros::declare_oxc_lint; +use oxc_regular_expression::ast::RegularExpressionFlags; use oxc_span::Span; use crate::{ @@ -67,7 +65,7 @@ impl Rule for BadReplaceAllArg { return; }; - if !flags.contains(RegExpFlags::G) { + if !flags.contains(RegularExpressionFlags::G) { let Some(call_expr_callee) = call_expr.callee.as_member_expression() else { return; }; @@ -83,7 +81,7 @@ impl Rule for BadReplaceAllArg { fn resolve_flags<'a>( expr: &'a Expression<'a>, ctx: &LintContext<'a>, -) -> Option<(RegExpFlags, Span)> { +) -> Option<(RegularExpressionFlags, Span)> { match expr.without_parentheses() { Expression::RegExpLiteral(regexp_literal) => { Some((regexp_literal.regex.flags, regexp_literal.span)) @@ -91,7 +89,8 @@ fn resolve_flags<'a>( Expression::NewExpression(new_expr) => { if new_expr.callee.is_specific_id("RegExp") 
{ Some(( - extract_regex_flags(&new_expr.arguments).unwrap_or(RegExpFlags::empty()), + extract_regex_flags(&new_expr.arguments) + .unwrap_or(RegularExpressionFlags::empty()), new_expr.span, )) } else { diff --git a/crates/oxc_linter/src/rules/unicorn/prefer_string_replace_all.rs b/crates/oxc_linter/src/rules/unicorn/prefer_string_replace_all.rs index 4452be0950c6c..f88eb69b2186d 100644 --- a/crates/oxc_linter/src/rules/unicorn/prefer_string_replace_all.rs +++ b/crates/oxc_linter/src/rules/unicorn/prefer_string_replace_all.rs @@ -1,9 +1,10 @@ use oxc_ast::{ - ast::{Argument, MemberExpression, RegExpFlags}, + ast::{Argument, MemberExpression}, AstKind, }; use oxc_diagnostics::OxcDiagnostic; use oxc_macros::declare_oxc_lint; +use oxc_regular_expression::ast::RegularExpressionFlags; use oxc_span::{CompactStr, GetSpan, Span}; use crate::{ast_util::extract_regex_flags, context::LintContext, rule::Rule, AstNode}; @@ -97,7 +98,7 @@ impl Rule for PreferStringReplaceAll { fn is_reg_exp_with_global_flag<'a>(expr: &'a Argument<'a>) -> bool { if let Argument::RegExpLiteral(reg_exp_literal) = expr { - return reg_exp_literal.regex.flags.contains(RegExpFlags::G); + return reg_exp_literal.regex.flags.contains(RegularExpressionFlags::G); } if let Argument::NewExpression(new_expr) = expr { @@ -106,7 +107,7 @@ fn is_reg_exp_with_global_flag<'a>(expr: &'a Argument<'a>) -> bool { } if let Some(flags) = extract_regex_flags(&new_expr.arguments) { - return flags.contains(RegExpFlags::G); + return flags.contains(RegularExpressionFlags::G); } } @@ -121,7 +122,7 @@ fn get_pattern_replacement<'a>( return None; }; - if !reg_exp_literal.regex.flags.contains(RegExpFlags::G) { + if !reg_exp_literal.regex.flags.contains(RegularExpressionFlags::G) { return None; } diff --git a/crates/oxc_linter/src/rules/unicorn/prefer_string_starts_ends_with.rs b/crates/oxc_linter/src/rules/unicorn/prefer_string_starts_ends_with.rs index 0986578567417..30cfa9b6cb0d7 100644 --- 
a/crates/oxc_linter/src/rules/unicorn/prefer_string_starts_ends_with.rs +++ b/crates/oxc_linter/src/rules/unicorn/prefer_string_starts_ends_with.rs @@ -1,9 +1,10 @@ use oxc_ast::{ - ast::{CallExpression, Expression, MemberExpression, RegExpFlags, RegExpLiteral}, + ast::{CallExpression, Expression, MemberExpression, RegExpLiteral}, AstKind, }; use oxc_diagnostics::OxcDiagnostic; use oxc_macros::declare_oxc_lint; +use oxc_regular_expression::ast::RegularExpressionFlags; use oxc_span::{GetSpan, Span}; use crate::{ @@ -139,8 +140,11 @@ enum ErrorKind { } fn check_regex(regexp_lit: &RegExpLiteral, pattern_text: &str) -> Option { - if regexp_lit.regex.flags.intersects(RegExpFlags::M) - || (regexp_lit.regex.flags.intersects(RegExpFlags::I | RegExpFlags::M) + if regexp_lit.regex.flags.intersects(RegularExpressionFlags::M) + || (regexp_lit + .regex + .flags + .intersects(RegularExpressionFlags::I | RegularExpressionFlags::M) && is_useless_case_sensitive_regex_flag(pattern_text)) { return None; diff --git a/crates/oxc_parser/examples/regular_expression.rs b/crates/oxc_parser/examples/regular_expression.rs index a26225afb24cc..8ed0900fcc598 100644 --- a/crates/oxc_parser/examples/regular_expression.rs +++ b/crates/oxc_parser/examples/regular_expression.rs @@ -4,7 +4,7 @@ use std::{env, fs, path::Path, sync::Arc}; use oxc_allocator::Allocator; use oxc_ast::{ast, AstKind, Visit}; use oxc_parser::{ParseOptions, Parser}; -use oxc_regular_expression::{FlagsParser, ParserOptions, PatternParser}; +use oxc_regular_expression::{FlagsParser, ParserOptions, PatternParser, PatternParserOptions}; use oxc_span::SourceType; // `cargo run -p oxc_parser --example regular_expression` @@ -86,10 +86,9 @@ impl<'a> Visit<'a> for RegularExpressionVisitor { let parsed = PatternParser::new( &allocator, pattern, - ParserOptions { + PatternParserOptions { span_offset: new_expr.span.start + 12, // = "new RegExp(\"".len() - unicode_mode: flags.unicode || flags.unicode_sets, - unicode_sets_mode: 
flags.unicode_sets, + flags, }, ) .parse(); diff --git a/crates/oxc_parser/src/cursor.rs b/crates/oxc_parser/src/cursor.rs index 7cf354c7df236..e44723a1ac6f2 100644 --- a/crates/oxc_parser/src/cursor.rs +++ b/crates/oxc_parser/src/cursor.rs @@ -1,8 +1,9 @@ //! Code related to navigating `Token`s from the lexer use oxc_allocator::Vec; -use oxc_ast::ast::{Decorator, RegExpFlags}; +use oxc_ast::ast::Decorator; use oxc_diagnostics::Result; +use oxc_regular_expression::ast::RegularExpressionFlags; use oxc_span::{GetSpan, Span}; use crate::{ @@ -223,7 +224,7 @@ impl<'a> ParserImpl<'a> { } /// Tell lexer to read a regex - pub(crate) fn read_regex(&mut self) -> Result<(u32, RegExpFlags)> { + pub(crate) fn read_regex(&mut self) -> Result<(u32, RegularExpressionFlags)> { let (token, pattern_end, flags) = self.lexer.next_regex(self.cur_kind())?; self.token = token; Ok((pattern_end, flags)) diff --git a/crates/oxc_parser/src/js/expression.rs b/crates/oxc_parser/src/js/expression.rs index 63a0c360de6b6..1daf4cb43c6a1 100644 --- a/crates/oxc_parser/src/js/expression.rs +++ b/crates/oxc_parser/src/js/expression.rs @@ -1,7 +1,7 @@ use oxc_allocator::Box; use oxc_ast::ast::*; use oxc_diagnostics::Result; -use oxc_regular_expression::ast::Pattern; +use oxc_regular_expression::ast::{Pattern, RegularExpressionFlags}; use oxc_span::{Atom, Span}; use oxc_syntax::{ number::{BigintBase, NumberBase}, @@ -363,14 +363,10 @@ impl<'a> ParserImpl<'a> { &mut self, span_offset: u32, pattern: &'a str, - flags: RegExpFlags, + flags: RegularExpressionFlags, ) -> Option>> { - use oxc_regular_expression::{ParserOptions, PatternParser}; - let options = ParserOptions { - span_offset, - unicode_mode: flags.contains(RegExpFlags::U) || flags.contains(RegExpFlags::V), - unicode_sets_mode: flags.contains(RegExpFlags::V), - }; + use oxc_regular_expression::{PatternParser, PatternParserOptions}; + let options = PatternParserOptions { span_offset, flags }; match PatternParser::new(self.ast.allocator, pattern, 
options).parse() { Ok(regular_expression) => Some(self.ast.alloc(regular_expression)), Err(diagnostic) => { diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index 307f7287c04af..9dd1d11623705 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -30,7 +30,6 @@ mod whitespace; use std::collections::VecDeque; use oxc_allocator::Allocator; -use oxc_ast::ast::RegExpFlags; use oxc_diagnostics::OxcDiagnostic; use oxc_span::{SourceType, Span}; use rustc_hash::FxHashMap; diff --git a/crates/oxc_parser/src/lexer/regex.rs b/crates/oxc_parser/src/lexer/regex.rs index 8d009124d9aea..afdd976dae4be 100644 --- a/crates/oxc_parser/src/lexer/regex.rs +++ b/crates/oxc_parser/src/lexer/regex.rs @@ -1,7 +1,8 @@ use oxc_diagnostics::Result; +use oxc_regular_expression::ast::RegularExpressionFlags; use oxc_syntax::identifier::is_line_terminator; -use super::{Kind, Lexer, RegExpFlags, Token}; +use super::{Kind, Lexer, Token}; use crate::diagnostics; impl<'a> Lexer<'a> { @@ -11,7 +12,10 @@ impl<'a> Lexer<'a> { /// where a `RegularExpressionLiteral` is permitted /// Which means the parser needs to re-tokenize on `PrimaryExpression`, /// `RegularExpressionLiteral` only appear on the right hand side of `PrimaryExpression` - pub(crate) fn next_regex(&mut self, kind: Kind) -> Result<(Token, u32, RegExpFlags)> { + pub(crate) fn next_regex( + &mut self, + kind: Kind, + ) -> Result<(Token, u32, RegularExpressionFlags)> { self.token.start = self.offset() - match kind { Kind::Slash => 1, @@ -25,14 +29,14 @@ impl<'a> Lexer<'a> { } /// 12.9.5 Regular Expression Literals - fn read_regex(&mut self) -> Result<(u32, RegExpFlags)> { + fn read_regex(&mut self) -> Result<(u32, RegularExpressionFlags)> { let mut in_escape = false; let mut in_character_class = false; loop { match self.next_char() { None => { return Err(diagnostics::unterminated_reg_exp(self.unterminated_range())); - // return (self.offset(), RegExpFlags::empty()); + // 
return (self.offset(), RegularExpressionFlags::empty()); } Some(c) if is_line_terminator(c) => { return Err(diagnostics::unterminated_reg_exp(self.unterminated_range())); @@ -54,13 +58,13 @@ impl<'a> Lexer<'a> { } let pattern_end = self.offset() - 1; // -1 to exclude `/` - let mut flags = RegExpFlags::empty(); + let mut flags = RegularExpressionFlags::empty(); while let Some(b @ (b'$' | b'_' | b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9')) = self.peek_byte() { self.consume_char(); - let Ok(flag) = RegExpFlags::try_from(b) else { + let Ok(flag) = RegularExpressionFlags::try_from(b) else { self.error(diagnostics::reg_exp_flag( b as char, self.current_offset().expand_left(1), diff --git a/crates/oxc_prettier/Cargo.toml b/crates/oxc_prettier/Cargo.toml index 3ee51f982585f..702289beedfaf 100644 --- a/crates/oxc_prettier/Cargo.toml +++ b/crates/oxc_prettier/Cargo.toml @@ -20,10 +20,11 @@ workspace = true doctest = false [dependencies] -oxc_allocator = { workspace = true } -oxc_ast = { workspace = true } -oxc_syntax = { workspace = true } -oxc_span = { workspace = true } +oxc_allocator = { workspace = true } +oxc_ast = { workspace = true } +oxc_syntax = { workspace = true } +oxc_span = { workspace = true } +oxc_regular_expression = { workspace = true } bitflags = { workspace = true } diff --git a/crates/oxc_prettier/src/format/mod.rs b/crates/oxc_prettier/src/format/mod.rs index ee1a4892f10b1..28d73d745381e 100644 --- a/crates/oxc_prettier/src/format/mod.rs +++ b/crates/oxc_prettier/src/format/mod.rs @@ -28,6 +28,7 @@ use std::borrow::Cow; use oxc_allocator::{Box, Vec}; use oxc_ast::{ast::*, AstKind}; +use oxc_regular_expression::ast::RegularExpressionFlags; use oxc_span::GetSpan; use oxc_syntax::identifier::is_identifier_name; @@ -2265,7 +2266,7 @@ impl<'a> Format<'a> for AssignmentPattern<'a> { } } -impl<'a> Format<'a> for RegExpFlags { +impl<'a> Format<'a> for RegularExpressionFlags { fn format(&self, p: &mut Prettier<'a>) -> Doc<'a> { let mut string = 
std::vec::Vec::with_capacity(self.iter().count()); if self.contains(Self::D) { diff --git a/crates/oxc_regular_expression/Cargo.toml b/crates/oxc_regular_expression/Cargo.toml index 3c979ab601bb0..3f39d0c03ea12 100644 --- a/crates/oxc_regular_expression/Cargo.toml +++ b/crates/oxc_regular_expression/Cargo.toml @@ -28,6 +28,7 @@ oxc_ast_macros = { workspace = true } phf = { workspace = true, features = ["macros"] } rustc-hash = { workspace = true } unicode-id-start = { workspace = true } +bitflags = { workspace = true } serde = { workspace = true, features = ["derive"], optional = true } tsify = { workspace = true, optional = true } diff --git a/crates/oxc_regular_expression/examples/parse_literal.rs b/crates/oxc_regular_expression/examples/parse_literal.rs index 26b50b9146415..7404554a0d58e 100644 --- a/crates/oxc_regular_expression/examples/parse_literal.rs +++ b/crates/oxc_regular_expression/examples/parse_literal.rs @@ -47,10 +47,10 @@ fn main() { let ret = parser.parse(); match ret { - Ok(ast::RegularExpression { pattern, flags, .. }) => { + Ok(ref regex @ ast::RegularExpression { ref pattern, flags, .. 
}) => { println!("✨ {}", pattern.span.source_text(source_text)); println!("{pattern:#?}"); - println!("✨ {}", flags.span.source_text(source_text)); + println!("✨ {}", regex.flags_span().source_text(source_text)); println!("{flags:?}"); } Err(error) => { diff --git a/crates/oxc_regular_expression/src/ast.rs b/crates/oxc_regular_expression/src/ast.rs index 5db3e44200687..f2540b0e50cee 100644 --- a/crates/oxc_regular_expression/src/ast.rs +++ b/crates/oxc_regular_expression/src/ast.rs @@ -5,6 +5,9 @@ // Silence erroneous warnings from Rust Analyser for `#[derive(Tsify)]` #![allow(non_snake_case)] +use std::hash::{Hash, Hasher}; + +use bitflags::bitflags; use oxc_allocator::{Box, CloneIn, Vec}; use oxc_ast_macros::ast; use oxc_span::{cmp::ContentEq, hash::ContentHash, Atom, Span}; @@ -20,23 +23,16 @@ use tsify::Tsify; pub struct RegularExpression<'a> { pub span: Span, pub pattern: Pattern<'a>, - pub flags: Flags, + pub flags: RegularExpressionFlags, } -#[ast] -#[derive(Debug, Clone)] -#[generate_derive(CloneIn, ContentEq, ContentHash)] -#[cfg_attr(feature = "serialize", derive(Serialize, Tsify))] -pub struct Flags { - pub span: Span, - pub global: bool, - pub ignore_case: bool, - pub multiline: bool, - pub unicode: bool, - pub sticky: bool, - pub dot_all: bool, - pub has_indices: bool, - pub unicode_sets: bool, +impl<'a> RegularExpression<'a> { + pub fn flags_span(&self) -> Span { + Span::new( + /* + 1 to skip the `/` in the middle */ self.pattern.span.end + 1, + self.span.end, + ) + } } /// The root of the `PatternParser` result. @@ -360,3 +356,98 @@ pub struct NamedReference<'a> { pub span: Span, pub name: Atom<'a>, } + +bitflags! { + /// Regular expression flags. + /// + /// + #[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] + pub struct RegularExpressionFlags: u8 { + /// Global flag + /// + /// Causes the pattern to match multiple times. + const G = 1 << 0; + /// Ignore case flag + /// + /// Causes the pattern to ignore case. 
+ const I = 1 << 1; + /// Multiline flag + /// + /// Causes `^` and `$` to match the start/end of each line. + const M = 1 << 2; + /// DotAll flag + /// + /// Causes `.` to also match newlines. + const S = 1 << 3; + /// Unicode flag + /// + /// Causes the pattern to treat the input as a sequence of Unicode code points. + const U = 1 << 4; + /// Sticky flag + /// + /// Perform a "sticky" search that matches starting at the current position in the target string. + const Y = 1 << 5; + /// Indices flag + /// + /// Causes the regular expression to generate indices for substring matches. + const D = 1 << 6; + /// Unicode sets flag + /// + /// Similar to the `u` flag, but also enables the `\\p{}` and `\\P{}` syntax. + /// Added by the [`v` flag proposal](https://github.com/tc39/proposal-regexp-set-notation). + const V = 1 << 7; + } +} + +impl ContentEq for RegularExpressionFlags { + fn content_eq(&self, other: &Self) -> bool { + self == other + } +} + +impl ContentHash for RegularExpressionFlags { + fn content_hash(&self, state: &mut H) { + Hash::hash(self, state); + } +} + +impl<'alloc> CloneIn<'alloc> for RegularExpressionFlags { + type Cloned = Self; + + fn clone_in(&self, _: &'alloc oxc_allocator::Allocator) -> Self::Cloned { + *self + } +} + +#[cfg(feature = "serialize")] +impl serde::Serialize for RegularExpressionFlags { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + serializer.serialize_str(&self.to_string()) + } +} + +#[cfg(feature = "serialize")] +#[wasm_bindgen::prelude::wasm_bindgen(typescript_custom_section)] +const TS_APPEND_CONTENT: &'static str = r#" +export type RegularExpressionFlags = { + /** Global flag */ + G: 1, + /** Ignore case flag */ + I: 2, + /** Multiline flag */ + M: 4, + /** DotAll flag */ + S: 8, + /** Unicode flag */ + U: 16, + /** Sticky flag */ + Y: 32, + /** Indices flag */ + D: 64, + /** Unicode sets flag */ + V: 128 +}; +"#; diff --git a/crates/oxc_regular_expression/src/body_parser/mod.rs 
b/crates/oxc_regular_expression/src/body_parser/mod.rs index 00cc7f969bcb0..96c0aa5a811a4 100644 --- a/crates/oxc_regular_expression/src/body_parser/mod.rs +++ b/crates/oxc_regular_expression/src/body_parser/mod.rs @@ -10,113 +10,111 @@ pub use parser::PatternParser; mod test { use oxc_allocator::Allocator; - use crate::{ParserOptions, PatternParser}; + use crate::{ast::RegularExpressionFlags, PatternParser, PatternParserOptions}; + + const DEFAULT_OPTIONS: PatternParserOptions = + PatternParserOptions { span_offset: 0, flags: RegularExpressionFlags::empty() }; + const UNICODE_OPTIONS: PatternParserOptions = + PatternParserOptions { span_offset: 0, flags: RegularExpressionFlags::U }; + const UNICODE_SET_OPTIONS: PatternParserOptions = + PatternParserOptions { span_offset: 0, flags: RegularExpressionFlags::V }; #[test] fn should_pass() { let allocator = Allocator::default(); for (source_text, options) in &[ - ("", ParserOptions::default()), - ("a", ParserOptions::default()), - ("a+", ParserOptions::default()), - ("a*", ParserOptions::default()), - ("a?", ParserOptions::default()), - ("^$^$^$", ParserOptions::default()), - ("(?=a){1}", ParserOptions::default()), - ("(?!a){1}", ParserOptions::default()), - ("a{1}", ParserOptions::default()), - ("a{1", ParserOptions::default()), - ("a|{", ParserOptions::default()), - ("a{", ParserOptions::default()), - ("a{,", ParserOptions::default()), - ("a{1,", ParserOptions::default()), - ("a{1,}", ParserOptions::default()), - ("a{1,2}", ParserOptions::default()), - ("x{9007199254740991}", ParserOptions::default()), - ("x{9007199254740991,9007199254740991}", ParserOptions::default()), - ("a|b", ParserOptions::default()), - ("a|b|c", ParserOptions::default()), - ("a|b+?|c", ParserOptions::default()), - ("a+b*?c{1}d{2,}e{3,4}?", ParserOptions::default()), - (r"^(?=ab)\b(?!cd)(?<=ef)\B(?.)\x1f", ParserOptions::default()), - ("a]", ParserOptions::default()), - ("a}", ParserOptions::default()), - ("]", ParserOptions::default()), - 
("[]", ParserOptions::default()), - ("[a]", ParserOptions::default()), - ("[ab]", ParserOptions::default()), - ("[a-b]", ParserOptions::default()), - ("[-]", ParserOptions::default()), - ("[a-]", ParserOptions::default()), - ("[-a]", ParserOptions::default()), - ("[-a-]", ParserOptions::default()), - (r"[a\-b]", ParserOptions::default()), - (r"[-a-b]", ParserOptions::default()), - (r"[a-b-]", ParserOptions::default()), - (r"[a\-b-]", ParserOptions::default()), - (r"[\[\]\-]", ParserOptions::default()), - ("[a-z0-9]", ParserOptions::default()), - ("[a-a]", ParserOptions::default()), - (r"[\d-\D]", ParserOptions::default()), - (r"^([\ud801[\udc28-\udc4f])$", ParserOptions::default()), - (r"[a-c]]", ParserOptions::default()), + (r"\p{Emoji_Presentation}\P{Script_Extensions=Latin}\p{Sc}|\p{P}", UNICODE_OPTIONS), + (r"^\p{General_Category=cntrl}+$", UNICODE_OPTIONS), + (r"\p{Basic_Emoji}", UNICODE_SET_OPTIONS), + (r"\n\cM\0\x41\u1f60\.\/", DEFAULT_OPTIONS), + (r"\c0", DEFAULT_OPTIONS), + (r"\0", DEFAULT_OPTIONS), + (r"\0", UNICODE_OPTIONS), + (r"\u", DEFAULT_OPTIONS), + (r"\u{", DEFAULT_OPTIONS), + (r"\u{}", DEFAULT_OPTIONS), + (r"\u{0}", DEFAULT_OPTIONS), + (r"\u{1f600}", DEFAULT_OPTIONS), + (r"\u{1f600}", UNICODE_OPTIONS), + ("(?:abc)", DEFAULT_OPTIONS), + (r"(?<\u{1d49c}>.)\x1f", DEFAULT_OPTIONS), + ("a]", DEFAULT_OPTIONS), + ("a}", DEFAULT_OPTIONS), + ("]", DEFAULT_OPTIONS), + ("[]", DEFAULT_OPTIONS), + ("[a]", DEFAULT_OPTIONS), + ("[ab]", DEFAULT_OPTIONS), + ("[a-b]", DEFAULT_OPTIONS), + ("[-]", DEFAULT_OPTIONS), + ("[a-]", DEFAULT_OPTIONS), + ("[-a]", DEFAULT_OPTIONS), + ("[-a-]", DEFAULT_OPTIONS), + (r"[a\-b]", DEFAULT_OPTIONS), + (r"[-a-b]", DEFAULT_OPTIONS), + (r"[a-b-]", DEFAULT_OPTIONS), + (r"[a\-b-]", DEFAULT_OPTIONS), + (r"[\[\]\-]", DEFAULT_OPTIONS), + ("[a-z0-9]", DEFAULT_OPTIONS), + ("[a-a]", DEFAULT_OPTIONS), + (r"[\d-\D]", DEFAULT_OPTIONS), + (r"^([\ud801[\udc28-\udc4f])$", DEFAULT_OPTIONS), + (r"[a-c]]", DEFAULT_OPTIONS), ( 
r"[ϗϙϛϝϟϡϣϥϧϩϫϭϯ-ϳϵϸϻ-ϼа-џѡѣѥѧѩѫѭѯѱѳѵѷѹѻѽѿҁҋҍҏґғҕҗҙқҝҟҡңҥҧҩҫҭүұҳҵҷҹһҽҿӂӄӆӈӊӌӎ-ӏӑӓӕӗәӛӝӟӡӣӥӧөӫӭӯӱӳӵӷӹӻӽӿԁԃԅԇԉԋԍԏԑԓԕԗԙԛԝԟԡԣա-ևᴀ-ᴫᵢ-ᵷᵹ-ᶚḁḃḅḇḉḋḍḏḑḓḕḗḙḛḝḟḡḣḥḧḩḫḭḯḱḳḵḷḹḻḽḿṁṃṅṇṉṋṍṏṑṓṕṗṙṛṝṟṡṣṥṧṩṫṭṯṱṳṵṷṹṻṽṿẁẃẅẇẉẋẍẏẑẓẕ-ẝẟạảấầẩẫậắằẳẵặẹẻẽếềểễệỉịọỏốồổỗộớờởỡợụủứừửữựỳỵỷỹỻỽỿ-ἇἐ-ἕἠ-ἧἰ-ἷὀ-ὅὐ-ὗὠ-ὧὰ]", - ParserOptions::default(), - ), - (r"[a-z0-9[.\\]]", ParserOptions::default().with_unicode_sets_mode()), - (r"[a&&b&&c]", ParserOptions::default().with_unicode_sets_mode()), - (r"[a--b--c]", ParserOptions::default().with_unicode_sets_mode()), - (r"[[a-z]--b--c]", ParserOptions::default().with_unicode_sets_mode()), - ( - r"[[[[[[[[[[[[[[[[[[[[[[[[a]]]]]]]]]]]]]]]]]]]]]]]]", - ParserOptions::default().with_unicode_sets_mode(), + DEFAULT_OPTIONS, ), - ( - r"[\q{}\q{a}\q{bc}\q{d|e|f}\q{|||}]", - ParserOptions::default().with_unicode_sets_mode(), - ), - (r"(?A)\k", ParserOptions::default()), - (r"(?)\k", ParserOptions::default()), - (r"\k", ParserOptions::default()), - (r"\k<4>", ParserOptions::default()), - (r"\k", ParserOptions::default()), - (r"(?)\k", ParserOptions::default()), - (r"(?)\k", ParserOptions::default().with_unicode_mode()), - (r"\1", ParserOptions::default()), - (r"\1()", ParserOptions::default()), - (r"\1()", ParserOptions::default().with_unicode_mode()), - (r"(?..)(?..)", ParserOptions::default()), + (r"[a-z0-9[.\\]]", UNICODE_SET_OPTIONS), + (r"[a&&b&&c]", UNICODE_SET_OPTIONS), + (r"[a--b--c]", UNICODE_SET_OPTIONS), + (r"[[a-z]--b--c]", UNICODE_SET_OPTIONS), + (r"[[[[[[[[[[[[[[[[[[[[[[[[a]]]]]]]]]]]]]]]]]]]]]]]]", UNICODE_SET_OPTIONS), + (r"[\q{}\q{a}\q{bc}\q{d|e|f}\q{|||}]", UNICODE_SET_OPTIONS), + (r"(?A)\k", DEFAULT_OPTIONS), + (r"(?)\k", DEFAULT_OPTIONS), + (r"\k", DEFAULT_OPTIONS), + (r"\k<4>", DEFAULT_OPTIONS), + (r"\k", DEFAULT_OPTIONS), + (r"(?)\k", DEFAULT_OPTIONS), + (r"(?)\k", UNICODE_OPTIONS), + (r"\1", DEFAULT_OPTIONS), + (r"\1()", DEFAULT_OPTIONS), + (r"\1()", UNICODE_OPTIONS), + (r"(?..)(?..)", DEFAULT_OPTIONS), // TODO: ES2025 Duplicate named capturing 
groups - // (r"(?..)|(?..)", ParserOptions::default()), - // (r"(?[0-9]{4})-[0-9]{2}|[0-9]{2}-(?[0-9]{4})", ParserOptions::default()), - // (r"(?:(?x)|(?y))\k", ParserOptions::default()), + // (r"(?..)|(?..)", DEFAULT_OPTIONS), + // (r"(?[0-9]{4})-[0-9]{2}|[0-9]{2}-(?[0-9]{4})", DEFAULT_OPTIONS), + // (r"(?:(?x)|(?y))\k", DEFAULT_OPTIONS), ] { let res = PatternParser::new(&allocator, source_text, *options).parse(); if let Err(err) = res { @@ -130,65 +128,65 @@ mod test { let allocator = Allocator::default(); for (source_text, options) in &[ - ("a)", ParserOptions::default()), - (r"a\", ParserOptions::default()), - ("a]", ParserOptions::default().with_unicode_mode()), - ("a}", ParserOptions::default().with_unicode_mode()), - ("a|+", ParserOptions::default()), - ("a|{", ParserOptions::default().with_unicode_mode()), - ("a{", ParserOptions::default().with_unicode_mode()), - ("a{1", ParserOptions::default().with_unicode_mode()), - ("a{1,", ParserOptions::default().with_unicode_mode()), - ("a{,", ParserOptions::default().with_unicode_mode()), - ("x{9007199254740992}", ParserOptions::default()), - ("x{9007199254740991,9007199254740992}", ParserOptions::default()), - ("x{99999999999999999999999999999999999999999999999999}", ParserOptions::default()), - (r"\99999999999999999999999999999999999999999999999999", ParserOptions::default()), - (r"\u{FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF}", ParserOptions::default().with_unicode_mode()), - ("(?=a", ParserOptions::default()), - ("(?", ParserOptions::default().with_unicode_mode()), - (r"\k<4>", ParserOptions::default().with_unicode_mode()), - (r"\k", ParserOptions::default().with_unicode_mode()), - ("a(?:", ParserOptions::default()), - ("(a", ParserOptions::default()), - ("(?", ParserOptions::default()), - (r"(?.)", ParserOptions::default()), - (r"(?.)", ParserOptions::default().with_unicode_mode()), - (r"(?<\>.)", ParserOptions::default()), - (r"(?<\>.)", ParserOptions::default().with_unicode_mode()), - ("(?)", ParserOptions::default()), 
- ("(?=a){1}", ParserOptions::default().with_unicode_mode()), - ("(?!a){1}", ParserOptions::default().with_unicode_mode()), - (r"[\d-\D]", ParserOptions::default().with_unicode_mode()), - ("[", ParserOptions::default()), - ("[", ParserOptions::default().with_unicode_sets_mode()), - ("[[", ParserOptions::default().with_unicode_sets_mode()), - ("[[]", ParserOptions::default().with_unicode_sets_mode()), - ("[z-a]", ParserOptions::default()), - (r"[a-c]]", ParserOptions::default().with_unicode_mode()), + ("a)", DEFAULT_OPTIONS), + (r"a\", DEFAULT_OPTIONS), + ("a]", UNICODE_OPTIONS), + ("a}", UNICODE_OPTIONS), + ("a|+", DEFAULT_OPTIONS), + ("a|{", UNICODE_OPTIONS), + ("a{", UNICODE_OPTIONS), + ("a{1", UNICODE_OPTIONS), + ("a{1,", UNICODE_OPTIONS), + ("a{,", UNICODE_OPTIONS), + ("x{9007199254740992}", DEFAULT_OPTIONS), + ("x{9007199254740991,9007199254740992}", DEFAULT_OPTIONS), + ("x{99999999999999999999999999999999999999999999999999}", DEFAULT_OPTIONS), + (r"\99999999999999999999999999999999999999999999999999", DEFAULT_OPTIONS), + (r"\u{FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF}", UNICODE_OPTIONS), + ("(?=a", DEFAULT_OPTIONS), + ("(?", UNICODE_OPTIONS), + (r"\k<4>", UNICODE_OPTIONS), + (r"\k", UNICODE_OPTIONS), + ("a(?:", DEFAULT_OPTIONS), + ("(a", DEFAULT_OPTIONS), + ("(?", DEFAULT_OPTIONS), + (r"(?.)", DEFAULT_OPTIONS), + (r"(?.)", UNICODE_OPTIONS), + (r"(?<\>.)", DEFAULT_OPTIONS), + (r"(?<\>.)", UNICODE_OPTIONS), + ("(?)", DEFAULT_OPTIONS), + ("(?=a){1}", UNICODE_OPTIONS), + ("(?!a){1}", UNICODE_OPTIONS), + (r"[\d-\D]", UNICODE_OPTIONS), + ("[", DEFAULT_OPTIONS), + ("[", UNICODE_SET_OPTIONS), + ("[[", UNICODE_SET_OPTIONS), + ("[[]", UNICODE_SET_OPTIONS), + ("[z-a]", DEFAULT_OPTIONS), + (r"[a-c]]", UNICODE_OPTIONS), ( 
r"^([a-zªµºß-öø-ÿāăąćĉċčďđēĕėęěĝğġģĥħĩīĭįıijĵķ-ĸĺļľŀłńņň-ʼnŋōŏőœŕŗřśŝşšţťŧũūŭůűųŵŷźżž-ƀƃƅƈƌ-ƍƒƕƙ-ƛƞơƣƥƨƪ-ƫƭưƴƶƹ-ƺƽ-ƿdžljnjǎǐǒǔǖǘǚǜ-ǝǟǡǣǥǧǩǫǭǯ-ǰdzǵǹǻǽǿȁȃȅȇȉȋȍȏȑȓȕȗșțȝȟȡȣȥȧȩȫȭȯȱȳ-ȹȼȿ-ɀɂɇɉɋɍɏ-ʓʕ-ʯͱͳͷͻ-ͽΐά-ώϐ-ϑϕ-ϗϙϛϝϟϡϣϥϧϩϫϭϯ-ϳϵϸϻ-ϼа-џѡѣѥѧѩѫѭѯѱѳѵѷѹѻѽѿҁҋҍҏґғҕҗҙқҝҟҡңҥҧҩҫҭүұҳҵҷҹһҽҿӂӄӆӈӊӌӎ-ӏӑӓӕӗәӛӝӟӡӣӥӧөӫӭӯӱӳӵӷӹӻӽӿԁԃԅԇԉԋԍԏԑԓԕԗԙԛԝԟԡԣա-ևᴀ-ᴫᵢ-ᵷᵹ-ᶚḁḃḅḇḉḋḍḏḑḓḕḗḙḛḝḟḡḣḥḧḩḫḭḯḱḳḵḷḹḻḽḿṁṃṅṇṉṋṍṏṑṓṕṗṙṛṝṟṡṣṥṧṩṫṭṯṱṳṵṷṹṻṽṿẁẃẅẇẉẋẍẏẑẓẕ-ẝẟạảấầẩẫậắằẳẵặẹẻẽếềểễệỉịọỏốồổỗộớờởỡợụủứừửữựỳỵỷỹỻỽỿ-ἇἐ-ἕἠ-ἧἰ-ἷὀ-ὅὐ-ὗὠ-ὧὰ-ώᾀ-ᾇᾐ-ᾗᾠ-ᾧᾰ-ᾴᾶ-ᾷιῂ-ῄῆ-ῇῐ-ΐῖ-ῗῠ-ῧῲ-ῴῶ-ῷⁱⁿℊℎ-ℏℓℯℴℹℼ-ℽⅆ-ⅉⅎↄⰰ-ⱞⱡⱥ-ⱦⱨⱪⱬⱱⱳ-ⱴⱶ-ⱼⲁⲃⲅⲇⲉⲋⲍⲏⲑⲓⲕⲗⲙⲛⲝⲟⲡⲣⲥⲧⲩⲫⲭⲯⲱⲳⲵⲷⲹⲻⲽⲿⳁⳃⳅⳇⳉⳋⳍⳏⳑⳓⳕⳗⳙⳛⳝⳟⳡⳣ-ⳤⴀ-ⴥꙁꙃꙅꙇꙉꙋꙍꙏꙑꙓꙕꙗꙙꙛꙝꙟꙣꙥꙧꙩꙫꙭꚁꚃꚅꚇꚉꚋꚍꚏꚑꚓꚕꚗꜣꜥꜧꜩꜫꜭꜯ-ꜱꜳꜵꜷꜹꜻꜽꜿꝁꝃꝅꝇꝉꝋꝍꝏꝑꝓꝕꝗꝙꝛꝝꝟꝡꝣꝥꝧꝩꝫꝭꝯꝱ-ꝸꝺꝼꝿꞁꞃꞅꞇꞌff-stﬓ-ﬗa-z]|\ud801[\udc28-\udc4f]|\ud835[\udc1a-\udc33\udc4e-\udc54\udc56-\udc67\udc82-\udc9b\udcb6-\udcb9\udcbb\udcbd-\udcc3\udcc5-\udccf\udcea-\udd03\udd1e-\udd37\udd52-\udd6b\udd86-\udd9f\uddba-\uddd3\uddee-\ude07\ude22-\ude3b\ude56-\ude6f\ude8a-\udea5\udec2-\udeda\udedc-\udee1\udefc-\udf14\udf16-\udf1b\udf36-\udf4e\udf50-\udf55\udf70-\udf88\udf8a-\udf8f\udfaa-\udfc2\udfc4-\udfc9\udfcb])$", - ParserOptions::default(), + DEFAULT_OPTIONS, ), - (r"[[\d-\D]]", ParserOptions::default().with_unicode_sets_mode()), - (r"[a&&b--c]", ParserOptions::default().with_unicode_sets_mode()), - (r"[a--b&&c]", ParserOptions::default().with_unicode_sets_mode()), - (r"[\q{]", ParserOptions::default().with_unicode_sets_mode()), - (r"[\q{\a}]", ParserOptions::default().with_unicode_sets_mode()), + (r"[[\d-\D]]", UNICODE_SET_OPTIONS), + (r"[a&&b--c]", UNICODE_SET_OPTIONS), + (r"[a--b&&c]", UNICODE_SET_OPTIONS), + (r"[\q{]", UNICODE_SET_OPTIONS), + (r"[\q{\a}]", UNICODE_SET_OPTIONS), // TODO: ES2025 Duplicate named capturing groups - (r"(?..)|(?..)", ParserOptions::default()), // This will be valid - // (r"(?|(?))", ParserOptions::default()), // Nested, still invalid + (r"(?..)|(?..)", DEFAULT_OPTIONS), // This will be valid + // (r"(?|(?))", DEFAULT_OPTIONS), // Nested, still invalid ] { assert!( 
PatternParser::new(&allocator, source_text, *options).parse().is_err(), @@ -203,40 +201,36 @@ mod test { for (source_text, options, is_err) in &[ // No tests for 4,294,967,295 left parens - (r"(?..)(?..)", ParserOptions::default(), true), - (r"a{2,1}", ParserOptions::default(), true), - (r"(?)\k", ParserOptions::default(), true), - (r"()\2", ParserOptions::default().with_unicode_mode(), true), - (r"[a-\d]", ParserOptions::default().with_unicode_mode(), true), - (r"[\d-z]", ParserOptions::default().with_unicode_mode(), true), - (r"[\d-\d]", ParserOptions::default().with_unicode_mode(), true), - (r"[z-a]", ParserOptions::default(), true), - (r"\u{110000}", ParserOptions::default().with_unicode_mode(), true), - (r"(?<\uD800\uDBFF>)", ParserOptions::default(), true), - (r"\u{0}\u{110000}", ParserOptions::default().with_unicode_mode(), true), - (r"(?)", ParserOptions::default(), true), - (r"\p{Foo=Bar}", ParserOptions::default().with_unicode_mode(), true), - (r"\p{Foo}", ParserOptions::default().with_unicode_mode(), true), - (r"\p{Basic_Emoji}", ParserOptions::default().with_unicode_mode(), true), - (r"\P{Basic_Emoji}", ParserOptions::default().with_unicode_sets_mode(), true), - (r"[^\p{Basic_Emoji}]", ParserOptions::default().with_unicode_sets_mode(), true), - (r"[[^\p{Basic_Emoji}]]", ParserOptions::default().with_unicode_sets_mode(), true), - (r"[[^\q{}]]", ParserOptions::default().with_unicode_sets_mode(), true), - (r"[[^\q{ng}]]", ParserOptions::default().with_unicode_sets_mode(), true), - (r"[[^\q{a|}]]", ParserOptions::default().with_unicode_sets_mode(), true), - (r"[[^\q{ng}\q{o|k}]]", ParserOptions::default().with_unicode_sets_mode(), true), - (r"[[^\q{o|k}\q{ng}\q{o|k}]]", ParserOptions::default().with_unicode_sets_mode(), true), - (r"[[^\q{o|k}\q{o|k}\q{ng}]]", ParserOptions::default().with_unicode_sets_mode(), true), - (r"[[^\q{}&&\q{ng}]]", ParserOptions::default().with_unicode_sets_mode(), true), - (r"[[^\q{ng}&&\q{o|k}]]", 
ParserOptions::default().with_unicode_sets_mode(), false), - ( - r"[[^\q{ng}&&\q{o|k}&&\q{ng}]]", - ParserOptions::default().with_unicode_sets_mode(), - false, - ), - (r"[[^\q{ng}--\q{o|k}]]", ParserOptions::default().with_unicode_sets_mode(), true), - (r"[[^\q{o|k}--\q{ng}]]", ParserOptions::default().with_unicode_sets_mode(), false), - (r"[[z-a]]", ParserOptions::default().with_unicode_sets_mode(), true), + (r"(?..)(?..)", DEFAULT_OPTIONS, true), + (r"a{2,1}", DEFAULT_OPTIONS, true), + (r"(?)\k", DEFAULT_OPTIONS, true), + (r"()\2", UNICODE_OPTIONS, true), + (r"[a-\d]", UNICODE_OPTIONS, true), + (r"[\d-z]", UNICODE_OPTIONS, true), + (r"[\d-\d]", UNICODE_OPTIONS, true), + (r"[z-a]", DEFAULT_OPTIONS, true), + (r"\u{110000}", UNICODE_OPTIONS, true), + (r"(?<\uD800\uDBFF>)", DEFAULT_OPTIONS, true), + (r"\u{0}\u{110000}", UNICODE_OPTIONS, true), + (r"(?)", DEFAULT_OPTIONS, true), + (r"\p{Foo=Bar}", UNICODE_OPTIONS, true), + (r"\p{Foo}", UNICODE_OPTIONS, true), + (r"\p{Basic_Emoji}", UNICODE_OPTIONS, true), + (r"\P{Basic_Emoji}", UNICODE_SET_OPTIONS, true), + (r"[^\p{Basic_Emoji}]", UNICODE_SET_OPTIONS, true), + (r"[[^\p{Basic_Emoji}]]", UNICODE_SET_OPTIONS, true), + (r"[[^\q{}]]", UNICODE_SET_OPTIONS, true), + (r"[[^\q{ng}]]", UNICODE_SET_OPTIONS, true), + (r"[[^\q{a|}]]", UNICODE_SET_OPTIONS, true), + (r"[[^\q{ng}\q{o|k}]]", UNICODE_SET_OPTIONS, true), + (r"[[^\q{o|k}\q{ng}\q{o|k}]]", UNICODE_SET_OPTIONS, true), + (r"[[^\q{o|k}\q{o|k}\q{ng}]]", UNICODE_SET_OPTIONS, true), + (r"[[^\q{}&&\q{ng}]]", UNICODE_SET_OPTIONS, true), + (r"[[^\q{ng}&&\q{o|k}]]", UNICODE_SET_OPTIONS, false), + (r"[[^\q{ng}&&\q{o|k}&&\q{ng}]]", UNICODE_SET_OPTIONS, false), + (r"[[^\q{ng}--\q{o|k}]]", UNICODE_SET_OPTIONS, true), + (r"[[^\q{o|k}--\q{ng}]]", UNICODE_SET_OPTIONS, false), + (r"[[z-a]]", UNICODE_SET_OPTIONS, true), ] { assert_eq!( PatternParser::new(&allocator, source_text, *options).parse().is_err(), @@ -249,7 +243,7 @@ mod test { #[test] fn should_handle_empty() { let allocator = 
Allocator::default(); - let pattern = PatternParser::new(&allocator, "", ParserOptions::default()).parse().unwrap(); + let pattern = PatternParser::new(&allocator, "", DEFAULT_OPTIONS).parse().unwrap(); assert_eq!(pattern.body.body[0].body.len(), 1); } @@ -260,9 +254,9 @@ mod test { let source_text = "このEmoji🥹の数が変わる"; for (options, expected) in &[ - (ParserOptions::default(), 15), - (ParserOptions::default().with_unicode_mode(), 14), - (ParserOptions::default().with_unicode_sets_mode(), 14), + (DEFAULT_OPTIONS, 15), + (DEFAULT_OPTIONS.with_flags(RegularExpressionFlags::U), 14), + (DEFAULT_OPTIONS.with_flags(RegularExpressionFlags::V), 14), ] { let pattern = PatternParser::new(&allocator, source_text, *options).parse().unwrap(); assert_eq!(pattern.body.body[0].body.len(), *expected); diff --git a/crates/oxc_regular_expression/src/body_parser/parser.rs b/crates/oxc_regular_expression/src/body_parser/parser.rs index d23d5bdfbf092..cc9003e277a5b 100644 --- a/crates/oxc_regular_expression/src/body_parser/parser.rs +++ b/crates/oxc_regular_expression/src/body_parser/parser.rs @@ -3,10 +3,10 @@ use oxc_diagnostics::Result; use oxc_span::Atom as SpanAtom; use crate::{ - ast, + ast::{self, RegularExpressionFlags}, body_parser::{reader::Reader, state::State, unicode, unicode_property}, diagnostics, - options::ParserOptions, + options::PatternParserOptions, span::SpanFactory, surrogate_pair, }; @@ -20,7 +20,11 @@ pub struct PatternParser<'a> { } impl<'a> PatternParser<'a> { - pub fn new(allocator: &'a Allocator, source_text: &'a str, options: ParserOptions) -> Self { + pub fn new( + allocator: &'a Allocator, + source_text: &'a str, + options: PatternParserOptions, + ) -> Self { // `RegExp` can not be empty. 
// - Literal `//` means just a single line comment // - For `new RegExp("")` or `new RegExp()` (= empty), use a placeholder @@ -30,8 +34,11 @@ impl<'a> PatternParser<'a> { allocator, source_text, span_factory: SpanFactory::new(options.span_offset), - reader: Reader::new(source_text, options.unicode_mode), - state: State::new(options.unicode_mode, options.unicode_sets_mode), + reader: Reader::new( + source_text, + options.flags.intersects(RegularExpressionFlags::U | RegularExpressionFlags::V), + ), + state: State::new(options.flags), } } @@ -141,7 +148,7 @@ impl<'a> PatternParser<'a> { // [+UnicodeMode] Assertion // [+UnicodeMode] Atom Quantifier // [+UnicodeMode] Atom - if self.state.unicode_mode { + if self.state.unicode_mode() { if let Some(assertion) = self.parse_assertion()? { return Ok(Some(assertion)); } @@ -458,7 +465,7 @@ impl<'a> PatternParser<'a> { // DecimalEscape: \1 means indexed reference if let Some(index) = self.consume_decimal_escape()? { - if self.state.unicode_mode { + if self.state.unicode_mode() { // [SS:EE] AtomEscape :: DecimalEscape // It is a Syntax Error if the CapturingGroupNumber of DecimalEscape is strictly greater than CountLeftCapturingParensWithin(the Pattern containing AtomEscape). if self.state.num_of_capturing_groups < index { @@ -573,7 +580,7 @@ impl<'a> PatternParser<'a> { &mut self, span_start: usize, ) -> Result>> { - if !self.state.unicode_mode { + if !self.state.unicode_mode() { return Ok(None); } @@ -683,7 +690,7 @@ impl<'a> PatternParser<'a> { } // e.g. \u{1f600} - if let Some(cp) = self.consume_reg_exp_unicode_escape_sequence(self.state.unicode_mode)? { + if let Some(cp) = self.consume_reg_exp_unicode_escape_sequence(self.state.unicode_mode())? { return Ok(Some(ast::Character { span: self.span_factory.create(span_start, self.reader.offset()), kind: ast::CharacterKind::UnicodeEscape, @@ -692,7 +699,7 @@ impl<'a> PatternParser<'a> { } // e.g. 
\18 - if !self.state.unicode_mode { + if !self.state.unicode_mode() { if let Some(cp) = self.consume_legacy_octal_escape_sequence() { return Ok(Some(ast::Character { span: self.span_factory.create(span_start, self.reader.offset()), @@ -781,7 +788,7 @@ impl<'a> PatternParser<'a> { } // [+UnicodeSetsMode] ClassSetExpression - if self.state.unicode_sets_mode { + if self.state.flags.contains(RegularExpressionFlags::V) { return self.parse_class_set_expression(); } @@ -868,7 +875,7 @@ impl<'a> PatternParser<'a> { // [SS:EE] NonemptyClassRangesNoDash :: ClassAtomNoDash - ClassAtom ClassContents // It is a Syntax Error if IsCharacterClass of the first ClassAtom is true or IsCharacterClass of the second ClassAtom is true and this production has a [UnicodeMode] parameter. // (Annex B) - if self.state.unicode_mode { + if self.state.unicode_mode() { return Err(diagnostics::character_class_range_invalid_atom( self.span_factory.create(range_span_start, self.reader.offset()), )); @@ -976,7 +983,7 @@ impl<'a> PatternParser<'a> { } // [+UnicodeMode] - - if self.state.unicode_mode && self.reader.eat('-') { + if self.state.unicode_mode() && self.reader.eat('-') { return Ok(Some(ast::CharacterClassContents::Character(ast::Character { span: self.span_factory.create(span_start, self.reader.offset()), kind: ast::CharacterKind::Symbol, @@ -985,7 +992,7 @@ impl<'a> PatternParser<'a> { } // [~UnicodeMode] c ClassControlLetter - if !self.state.unicode_mode { + if !self.state.unicode_mode() { let checkpoint = self.reader.checkpoint(); if self.reader.eat('c') { @@ -1729,7 +1736,7 @@ impl<'a> PatternParser<'a> { // [SS:EE] UnicodePropertyValueExpression :: LoneUnicodePropertyNameOrValue // It is a Syntax Error if the enclosing Pattern does not have a [UnicodeSetsMode] parameter and the source text matched by LoneUnicodePropertyNameOrValue is a binary property of strings listed in the “Property name” column of Table 67. 
if unicode_property::is_valid_lone_unicode_property_of_strings(&name_or_value) { - if !self.state.unicode_sets_mode { + if !self.state.flags.contains(RegularExpressionFlags::V) { return Err(diagnostics::invalid_unicode_property_of_strings( self.span_factory.create(span_start, self.reader.offset()), name_or_value.as_str(), @@ -1849,7 +1856,7 @@ impl<'a> PatternParser<'a> { )); } - if !self.state.unicode_mode { + if !self.state.unicode_mode() { let span_start = self.reader.offset(); if let Some(lead_surrogate) = @@ -1910,7 +1917,7 @@ impl<'a> PatternParser<'a> { )); } - if !self.state.unicode_mode { + if !self.state.unicode_mode() { let span_start = self.reader.offset(); if let Some(lead_surrogate) = @@ -2019,7 +2026,7 @@ impl<'a> PatternParser<'a> { self.reader.rewind(checkpoint); } - if self.state.unicode_mode { + if self.state.unicode_mode() { return Err(diagnostics::invalid_unicode_escape_sequence( self.span_factory.create(span_start, self.reader.offset()), )); @@ -2094,7 +2101,7 @@ impl<'a> PatternParser<'a> { fn consume_identity_escape(&mut self) -> Option { let cp = self.reader.peek()?; - if self.state.unicode_mode { + if self.state.unicode_mode() { if unicode::is_syntax_character(cp) || cp == '/' as u32 { self.reader.advance(); return Some(cp); diff --git a/crates/oxc_regular_expression/src/body_parser/state.rs b/crates/oxc_regular_expression/src/body_parser/state.rs index 109c5acc3adf5..dc3e5fc61e37e 100644 --- a/crates/oxc_regular_expression/src/body_parser/state.rs +++ b/crates/oxc_regular_expression/src/body_parser/state.rs @@ -1,5 +1,7 @@ use rustc_hash::FxHashSet; +use crate::ast::RegularExpressionFlags; + use super::reader::Reader; /// Currently all of properties are read only from outside of this module. 
@@ -7,8 +9,7 @@ use super::reader::Reader; #[derive(Debug)] pub struct State<'a> { // Mode flags - pub unicode_mode: bool, - pub unicode_sets_mode: bool, + pub flags: RegularExpressionFlags, pub named_capture_groups: bool, // Other states pub num_of_capturing_groups: u32, @@ -16,10 +17,9 @@ pub struct State<'a> { } impl<'a> State<'a> { - pub fn new(unicode_mode: bool, unicode_sets_mode: bool) -> Self { + pub fn new(flags: RegularExpressionFlags) -> Self { Self { - unicode_mode, - unicode_sets_mode, + flags, named_capture_groups: false, num_of_capturing_groups: 0, capturing_group_names: FxHashSet::default(), @@ -37,14 +37,18 @@ impl<'a> State<'a> { // It is `true` // - if `u` or `v` flag is set // - or if `GroupName` is found in pattern - self.named_capture_groups = - self.unicode_mode || self.unicode_sets_mode || !capturing_group_names.is_empty(); + self.named_capture_groups = self.unicode_mode() || !capturing_group_names.is_empty(); self.num_of_capturing_groups = num_of_left_capturing_parens; self.capturing_group_names = capturing_group_names; duplicated_named_capturing_groups } + + #[inline] + pub fn unicode_mode(&self) -> bool { + self.flags.intersects(RegularExpressionFlags::U | RegularExpressionFlags::V) + } } /// Returns: (num_of_left_parens, capturing_group_names, duplicated_named_capturing_groups) diff --git a/crates/oxc_regular_expression/src/diagnostics.rs b/crates/oxc_regular_expression/src/diagnostics.rs index e0e6866d69e86..d2dd934af33b0 100644 --- a/crates/oxc_regular_expression/src/diagnostics.rs +++ b/crates/oxc_regular_expression/src/diagnostics.rs @@ -28,8 +28,8 @@ pub fn duplicated_flag(span: Span) -> OxcDiagnostic { } #[cold] -pub fn unknown_flag(span: Span) -> OxcDiagnostic { - OxcDiagnostic::error(format!("{PREFIX} Unknown flag")).with_label(span) +pub fn unknown_flag() -> OxcDiagnostic { + OxcDiagnostic::error(format!("{PREFIX} Unknown flag")) } #[cold] diff --git a/crates/oxc_regular_expression/src/display.rs 
b/crates/oxc_regular_expression/src/display.rs index 58c5ecc1fb9b6..26f8ad50e55ce 100644 --- a/crates/oxc_regular_expression/src/display.rs +++ b/crates/oxc_regular_expression/src/display.rs @@ -13,29 +13,26 @@ impl<'a> Display for RegularExpression<'a> { } } -impl Display for Flags { +impl Display for RegularExpressionFlags { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let mut flags = String::with_capacity(8); - macro_rules! if_true_append { + macro_rules! if_contains_write { ($flag:ident, $char:literal) => { - if self.$flag { - flags.push($char); + if self.contains(Self::$flag) { + write!(f, "{}", $char)?; } }; } - // write flags in the order they are described in the `MDN` - // - if_true_append!(has_indices, 'd'); - if_true_append!(global, 'g'); - if_true_append!(ignore_case, 'i'); - if_true_append!(multiline, 'm'); - if_true_append!(dot_all, 's'); - if_true_append!(unicode, 'u'); - if_true_append!(unicode_sets, 'v'); - if_true_append!(sticky, 'y'); - - write!(f, "{flags}") + if_contains_write!(G, 'g'); + if_contains_write!(I, 'i'); + if_contains_write!(M, 'm'); + if_contains_write!(S, 's'); + if_contains_write!(U, 'u'); + if_contains_write!(Y, 'y'); + if_contains_write!(D, 'd'); + if_contains_write!(V, 'v'); + + Ok(()) } } diff --git a/crates/oxc_regular_expression/src/flag_parser.rs b/crates/oxc_regular_expression/src/flag_parser.rs index 1c4b059ed32fc..970aa423acad2 100644 --- a/crates/oxc_regular_expression/src/flag_parser.rs +++ b/crates/oxc_regular_expression/src/flag_parser.rs @@ -1,12 +1,10 @@ use oxc_allocator::Allocator; -use oxc_diagnostics::Result; -use rustc_hash::FxHashSet; +use oxc_diagnostics::{OxcDiagnostic, Result}; -use crate::{ast, diagnostics, options::ParserOptions, span::SpanFactory}; +use crate::{ast::RegularExpressionFlags, diagnostics, options::ParserOptions, span::SpanFactory}; pub struct FlagsParser<'a> { source_text: &'a str, - // options: ParserOptions, span_factory: SpanFactory, } @@ -19,50 +17,59 @@ impl<'a> 
FlagsParser<'a> { } } - pub fn parse(&mut self) -> Result<ast::Flags> { - let span = self.span_factory.create(0, self.source_text.len()); - let mut global = false; - let mut ignore_case = false; - let mut multiline = false; - let mut unicode = false; - let mut sticky = false; - let mut dot_all = false; - let mut has_indices = false; - let mut unicode_sets = false; - - let mut existing_flags = FxHashSet::default(); - for (idx, c) in self.source_text.char_indices() { - if !existing_flags.insert(c) { + pub fn parse(&mut self) -> Result<RegularExpressionFlags> { + let mut flags = RegularExpressionFlags::empty(); + let mut idx = 0; + for c in self.source_text.chars() { + let flag = RegularExpressionFlags::try_from(c) + .map_err(|e| e.with_label(self.span_factory.create(idx, idx)))?; + if flags.contains(flag) { return Err(diagnostics::duplicated_flag(self.span_factory.create(idx, idx))); } + flags |= flag; + idx += 1; + } - match c { - 'g' => global = true, - 'i' => ignore_case = true, - 'm' => multiline = true, - 'u' => unicode = true, - 'y' => sticky = true, - 's' => dot_all = true, - 'd' => has_indices = true, - 'v' => unicode_sets = true, - _ => return Err(diagnostics::unknown_flag(self.span_factory.create(idx, idx))), - } + if flags.contains(RegularExpressionFlags::U | RegularExpressionFlags::V) { + return Err(diagnostics::invalid_unicode_flags(self.span_factory.create(0, idx))); } - if unicode && unicode_sets { - return Err(diagnostics::invalid_unicode_flags(span)); + Ok(flags) + } +} + +impl TryFrom<char> for RegularExpressionFlags { + type Error = OxcDiagnostic; + + fn try_from(value: char) -> Result<Self> { + match value { + 'g' => Ok(Self::G), + 'i' => Ok(Self::I), + 'm' => Ok(Self::M), + 's' => Ok(Self::S), + 'u' => Ok(Self::U), + 'y' => Ok(Self::Y), + 'd' => Ok(Self::D), + 'v' => Ok(Self::V), + _ => Err(diagnostics::unknown_flag()), } + } +} - Ok(ast::Flags { - span, - global, - ignore_case, - multiline, - unicode, - sticky, - dot_all, - has_indices, - unicode_sets, - }) +impl TryFrom<u8> for 
RegularExpressionFlags { + type Error = u8; + + fn try_from(value: u8) -> std::result::Result<Self, Self::Error> { + match value { + b'g' => Ok(Self::G), + b'i' => Ok(Self::I), + b'm' => Ok(Self::M), + b's' => Ok(Self::S), + b'u' => Ok(Self::U), + b'y' => Ok(Self::Y), + b'd' => Ok(Self::D), + b'v' => Ok(Self::V), + _ => Err(value), + } } } diff --git a/crates/oxc_regular_expression/src/generated/derive_clone_in.rs b/crates/oxc_regular_expression/src/generated/derive_clone_in.rs index 6a3f9be434338..dbf63bbf26a81 100644 --- a/crates/oxc_regular_expression/src/generated/derive_clone_in.rs +++ b/crates/oxc_regular_expression/src/generated/derive_clone_in.rs @@ -19,23 +19,6 @@ impl<'old_alloc, 'new_alloc> CloneIn<'new_alloc> for RegularExpression<'old_allo } } -impl<'alloc> CloneIn<'alloc> for Flags { - type Cloned = Flags; - fn clone_in(&self, allocator: &'alloc Allocator) -> Self::Cloned { - Flags { - span: CloneIn::clone_in(&self.span, allocator), - global: CloneIn::clone_in(&self.global, allocator), - ignore_case: CloneIn::clone_in(&self.ignore_case, allocator), - multiline: CloneIn::clone_in(&self.multiline, allocator), - unicode: CloneIn::clone_in(&self.unicode, allocator), - sticky: CloneIn::clone_in(&self.sticky, allocator), - dot_all: CloneIn::clone_in(&self.dot_all, allocator), - has_indices: CloneIn::clone_in(&self.has_indices, allocator), - unicode_sets: CloneIn::clone_in(&self.unicode_sets, allocator), - } - } -} - impl<'old_alloc, 'new_alloc> CloneIn<'new_alloc> for Pattern<'old_alloc> { type Cloned = Pattern<'new_alloc>; fn clone_in(&self, allocator: &'new_alloc Allocator) -> Self::Cloned { diff --git a/crates/oxc_regular_expression/src/generated/derive_content_eq.rs b/crates/oxc_regular_expression/src/generated/derive_content_eq.rs index 1dd6c2d5cb928..d041a31e3b6e4 100644 --- a/crates/oxc_regular_expression/src/generated/derive_content_eq.rs +++ b/crates/oxc_regular_expression/src/generated/derive_content_eq.rs @@ -15,19 +15,6 @@ impl<'a> ContentEq for 
RegularExpression<'a> { } } -impl ContentEq for Flags { - fn content_eq(&self, other: &Self) -> bool { - ContentEq::content_eq(&self.global, &other.global) - && ContentEq::content_eq(&self.ignore_case, &other.ignore_case) - && ContentEq::content_eq(&self.multiline, &other.multiline) - && ContentEq::content_eq(&self.unicode, &other.unicode) - && ContentEq::content_eq(&self.sticky, &other.sticky) - && ContentEq::content_eq(&self.dot_all, &other.dot_all) - && ContentEq::content_eq(&self.has_indices, &other.has_indices) - && ContentEq::content_eq(&self.unicode_sets, &other.unicode_sets) - } -} - impl<'a> ContentEq for Pattern<'a> { fn content_eq(&self, other: &Self) -> bool { ContentEq::content_eq(&self.body, &other.body) diff --git a/crates/oxc_regular_expression/src/generated/derive_content_hash.rs b/crates/oxc_regular_expression/src/generated/derive_content_hash.rs index 5d3b722899039..231df68c71f45 100644 --- a/crates/oxc_regular_expression/src/generated/derive_content_hash.rs +++ b/crates/oxc_regular_expression/src/generated/derive_content_hash.rs @@ -17,19 +17,6 @@ impl<'a> ContentHash for RegularExpression<'a> { } } -impl ContentHash for Flags { - fn content_hash(&self, state: &mut H) { - ContentHash::content_hash(&self.global, state); - ContentHash::content_hash(&self.ignore_case, state); - ContentHash::content_hash(&self.multiline, state); - ContentHash::content_hash(&self.unicode, state); - ContentHash::content_hash(&self.sticky, state); - ContentHash::content_hash(&self.dot_all, state); - ContentHash::content_hash(&self.has_indices, state); - ContentHash::content_hash(&self.unicode_sets, state); - } -} - impl<'a> ContentHash for Pattern<'a> { fn content_hash(&self, state: &mut H) { ContentHash::content_hash(&self.body, state); diff --git a/crates/oxc_regular_expression/src/lib.rs b/crates/oxc_regular_expression/src/lib.rs index d1b19e075df96..7cad2a75465fe 100644 --- a/crates/oxc_regular_expression/src/lib.rs +++ b/crates/oxc_regular_expression/src/lib.rs @@ 
-17,6 +17,8 @@ mod generated { } pub use crate::{ - body_parser::PatternParser, flag_parser::FlagsParser, literal_parser::Parser, - options::ParserOptions, + body_parser::PatternParser, + flag_parser::FlagsParser, + literal_parser::Parser, + options::{ParserOptions, PatternParserOptions}, }; diff --git a/crates/oxc_regular_expression/src/literal_parser.rs b/crates/oxc_regular_expression/src/literal_parser.rs index 56a9e0eb1991b..ed6d8730519ec 100644 --- a/crates/oxc_regular_expression/src/literal_parser.rs +++ b/crates/oxc_regular_expression/src/literal_parser.rs @@ -3,7 +3,7 @@ use oxc_diagnostics::Result; use crate::{ ast, body_parser::PatternParser, diagnostics, flag_parser::FlagsParser, options::ParserOptions, - span::SpanFactory, + span::SpanFactory, PatternParserOptions, }; /// LiteralParser @@ -40,17 +40,15 @@ impl<'a> Parser<'a> { .parse()?; // Then parse the pattern with the flags - let pattern_options = match (flags.unicode, flags.unicode_sets) { - (true, false) => self.options.with_unicode_mode(), - (_, true) => self.options.with_unicode_sets_mode(), - _ => self.options, - }; let pattern = PatternParser::new( self.allocator, &self.source_text[body_start_offset..body_end_offset], - #[allow(clippy::cast_possible_truncation)] - pattern_options.with_span_offset(self.options.span_offset + body_start_offset as u32), + PatternParserOptions { + #[allow(clippy::cast_possible_truncation)] + span_offset: self.options.span_offset + body_start_offset as u32, + flags, + }, ) .parse()?; diff --git a/crates/oxc_regular_expression/src/options.rs b/crates/oxc_regular_expression/src/options.rs index a67d058129d81..19eed09434ec1 100644 --- a/crates/oxc_regular_expression/src/options.rs +++ b/crates/oxc_regular_expression/src/options.rs @@ -1,26 +1,34 @@ +use crate::ast::RegularExpressionFlags; + #[derive(Clone, Copy, Debug, Default)] pub struct ParserOptions { /// Used to adjust Span positions to fit the global source code. 
pub span_offset: u32, - /// Unicode mode(`u` or `v` flag) enabled or not. - pub unicode_mode: bool, - /// Extended Unicode mode(`v` flag) enabled or not. - pub unicode_sets_mode: bool, } impl ParserOptions { #[must_use] - pub fn with_span_offset(self, span_offset: u32) -> ParserOptions { - ParserOptions { span_offset, ..self } + pub fn with_span_offset(self, span_offset: u32) -> Self { + Self { span_offset } } +} + +#[derive(Clone, Copy, Debug, Default)] +pub struct PatternParserOptions { + /// Used to adjust Span positions to fit the global source code. + pub span_offset: u32, + /// Regular expression flags + pub flags: RegularExpressionFlags, +} +impl PatternParserOptions { #[must_use] - pub fn with_unicode_mode(self) -> ParserOptions { - ParserOptions { unicode_mode: true, ..self } + pub fn with_span_offset(self, span_offset: u32) -> Self { + Self { span_offset, ..self } } #[must_use] - pub fn with_unicode_sets_mode(self) -> ParserOptions { - ParserOptions { unicode_mode: true, unicode_sets_mode: true, ..self } + pub fn with_flags(self, flags: RegularExpressionFlags) -> Self { + Self { flags, ..self } } } diff --git a/crates/oxc_semantic/Cargo.toml b/crates/oxc_semantic/Cargo.toml index 67486f82dec79..05517c57da85d 100644 --- a/crates/oxc_semantic/Cargo.toml +++ b/crates/oxc_semantic/Cargo.toml @@ -19,13 +19,14 @@ workspace = true doctest = false [dependencies] -oxc_span = { workspace = true } -oxc_ast = { workspace = true } -oxc_syntax = { workspace = true } -oxc_cfg = { workspace = true } -oxc_diagnostics = { workspace = true } -oxc_index = { workspace = true } -oxc_allocator = { workspace = true } +oxc_span = { workspace = true } +oxc_ast = { workspace = true } +oxc_syntax = { workspace = true } +oxc_cfg = { workspace = true } +oxc_diagnostics = { workspace = true } +oxc_index = { workspace = true } +oxc_allocator = { workspace = true } +oxc_regular_expression = { workspace = true } assert-unchecked = { workspace = true } indexmap = { workspace = true } diff 
--git a/crates/oxc_semantic/src/checker/javascript.rs b/crates/oxc_semantic/src/checker/javascript.rs index ac92606ad3d10..9de5b8209704f 100644 --- a/crates/oxc_semantic/src/checker/javascript.rs +++ b/crates/oxc_semantic/src/checker/javascript.rs @@ -5,6 +5,7 @@ use oxc_ast::{ AstKind, }; use oxc_diagnostics::{LabeledSpan, OxcDiagnostic}; +use oxc_regular_expression::ast::RegularExpressionFlags; use oxc_span::{GetSpan, ModuleKind, Span}; use oxc_syntax::{ module_record::ExportLocalName, @@ -514,7 +515,7 @@ fn reg_exp_flag_u_and_v(span: Span) -> OxcDiagnostic { pub fn check_regexp_literal(lit: &RegExpLiteral, ctx: &SemanticBuilder<'_>) { let flags = lit.regex.flags; - if flags.contains(RegExpFlags::U | RegExpFlags::V) { + if flags.contains(RegularExpressionFlags::U | RegularExpressionFlags::V) { ctx.error(reg_exp_flag_u_and_v(lit.span)); } } diff --git a/crates/oxc_transformer/src/regexp/mod.rs b/crates/oxc_transformer/src/regexp/mod.rs index 53a992bfc555a..e6a85fc8bc823 100644 --- a/crates/oxc_transformer/src/regexp/mod.rs +++ b/crates/oxc_transformer/src/regexp/mod.rs @@ -49,7 +49,8 @@ use std::borrow::Cow; use oxc_ast::ast::*; use oxc_diagnostics::Result; use oxc_regular_expression::ast::{ - CharacterClass, CharacterClassContents, LookAroundAssertionKind, Pattern, Term, + CharacterClass, CharacterClassContents, LookAroundAssertionKind, Pattern, + RegularExpressionFlags, Term, }; use oxc_semantic::ReferenceFlags; use oxc_span::{Atom, SPAN}; @@ -62,7 +63,7 @@ pub use options::RegExpOptions; pub struct RegExp<'a> { ctx: Ctx<'a>, - unsupported_flags: RegExpFlags, + unsupported_flags: RegularExpressionFlags, some_unsupported_patterns: bool, look_behind_assertions: bool, named_capture_groups: bool, @@ -72,21 +73,21 @@ pub struct RegExp<'a> { impl<'a> RegExp<'a> { pub fn new(options: RegExpOptions, ctx: Ctx<'a>) -> Self { // Get unsupported flags - let mut unsupported_flags = RegExpFlags::empty(); + let mut unsupported_flags = RegularExpressionFlags::empty(); if 
options.dot_all_flag { - unsupported_flags |= RegExpFlags::S; + unsupported_flags |= RegularExpressionFlags::S; } if options.sticky_flag { - unsupported_flags |= RegExpFlags::Y; + unsupported_flags |= RegularExpressionFlags::Y; } if options.unicode_flag { - unsupported_flags |= RegExpFlags::U; + unsupported_flags |= RegularExpressionFlags::U; } if options.match_indices { - unsupported_flags |= RegExpFlags::D; + unsupported_flags |= RegularExpressionFlags::D; } if options.set_notation { - unsupported_flags |= RegExpFlags::V; + unsupported_flags |= RegularExpressionFlags::V; } // Get if some unsupported patterns @@ -243,15 +244,14 @@ fn character_class_has_unicode_property_escape(character_class: &CharacterClass) fn try_parse_pattern<'a>( raw: &'a str, span: Span, - flags: RegExpFlags, + flags: RegularExpressionFlags, ctx: &mut TraverseCtx<'a>, ) -> Result> { - use oxc_regular_expression::{ParserOptions, PatternParser}; + use oxc_regular_expression::{PatternParser, PatternParserOptions}; - let options = ParserOptions { + let options = PatternParserOptions { span_offset: span.start + 1, // exclude `/` - unicode_mode: flags.contains(RegExpFlags::U) || flags.contains(RegExpFlags::V), - unicode_sets_mode: flags.contains(RegExpFlags::V), + flags, }; PatternParser::new(ctx.ast.allocator, raw, options).parse() } diff --git a/tasks/ast_tools/src/passes/calc_layout.rs b/tasks/ast_tools/src/passes/calc_layout.rs index e66d6fc427ad2..bc4063c98fa56 100644 --- a/tasks/ast_tools/src/passes/calc_layout.rs +++ b/tasks/ast_tools/src/passes/calc_layout.rs @@ -359,6 +359,6 @@ lazy_static! { // Unsupported: this is a `bitflags` generated type, we don't expand macros ReferenceFlags: { _ => Layout::known(1, 1, 0), }, // Unsupported: this is a `bitflags` generated type, we don't expand macros - RegExpFlags: { _ => Layout::known(1, 1, 0), }, + RegularExpressionFlags: { _ => Layout::known(1, 1, 0), }, }; }