From b0dc8b50310880f8c73ea8de7a51513fc6070d84 Mon Sep 17 00:00:00 2001 From: Yuji Sugiura Date: Mon, 26 Aug 2024 12:56:36 +0900 Subject: [PATCH 1/3] Remove u16 reader, mark surrogate pairs instead --- crates/oxc_regular_expression/src/ast.rs | 6 +- .../src/body_parser/mod.rs | 54 ++++- .../src/body_parser/parser.rs | 20 +- .../src/body_parser/reader.rs | 216 ++++++------------ .../src/body_parser/state.rs | 2 +- .../src/body_parser/unicode.rs | 4 + 6 files changed, 144 insertions(+), 158 deletions(-) diff --git a/crates/oxc_regular_expression/src/ast.rs b/crates/oxc_regular_expression/src/ast.rs index 76669ef92d746..23f1e59a1bf84 100644 --- a/crates/oxc_regular_expression/src/ast.rs +++ b/crates/oxc_regular_expression/src/ast.rs @@ -108,14 +108,12 @@ pub struct Quantifier<'a> { /// Single character. #[derive(Debug, Copy, Clone)] pub struct Character { - /// This will be invalid position when `UnicodeMode` is disabled and `value` is a surrogate pair. pub span: Span, pub kind: CharacterKind, - /// Unicode code point or UTF-16 code unit. pub value: u32, } -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, Eq, PartialEq)] pub enum CharacterKind { ControlLetter, HexadecimalEscape, @@ -124,6 +122,8 @@ pub enum CharacterKind { Octal, SingleEscape, Symbol, + /// In non `UnicodeMode`, some `Symbol` is marked as `SurrogatePairs`. + SurrogatePairs, UnicodeEscape, } diff --git a/crates/oxc_regular_expression/src/body_parser/mod.rs b/crates/oxc_regular_expression/src/body_parser/mod.rs index e6f3804e1ef66..258825b8a0d11 100644 --- a/crates/oxc_regular_expression/src/body_parser/mod.rs +++ b/crates/oxc_regular_expression/src/body_parser/mod.rs @@ -8,7 +8,7 @@ pub use parser::PatternParser; #[cfg(test)] mod test { - use crate::{ParserOptions, PatternParser}; + use crate::{ast, ParserOptions, PatternParser}; use oxc_allocator::Allocator; #[test] @@ -243,17 +243,57 @@ mod test { } #[test] - fn should_handle_unicode() { + fn should_handle_surrogate_pairs() { let allocator = Allocator::default(); - let source_text = "このEmoji🥹の数が変わる"; + let source_text = "🚀, 𐍈, 𝄞, and so on..."; for (options, expected) in &[ - (ParserOptions::default(), 15), - (ParserOptions::default().with_unicode_mode(), 14), - (ParserOptions::default().with_unicode_sets_mode(), 14), + (ParserOptions::default(), 3), + (ParserOptions::default().with_unicode_mode(), 0), + (ParserOptions::default().with_unicode_sets_mode(), 0), ] { let pattern = PatternParser::new(&allocator, source_text, *options).parse().unwrap(); - assert_eq!(pattern.body.body[0].body.len(), *expected); + + let alternative = &pattern.body.body[0]; + assert_eq!( + alternative + .body + .iter() + .filter(|term| { + if let ast::Term::Character(ch) = term { + return ch.kind == ast::CharacterKind::SurrogatePairs; + } + false + }) + .count(), + *expected + ); + } + + let source_text = "[🥹🚀𝄞]"; + for (options, expected) in &[ + (ParserOptions::default(), 3), + (ParserOptions::default().with_unicode_mode(), 0), + (ParserOptions::default().with_unicode_sets_mode(), 0), + ] { + let pattern = PatternParser::new(&allocator, source_text, *options).parse().unwrap(); + + let ast::Term::CharacterClass(character_class) = &pattern.body.body[0].body[0] else { + panic!("Expected character class"); + }; + assert_eq!( + character_class + .body + .iter() + .filter(|ccc| { + if let ast::CharacterClassContents::Character(ch) = ccc { + return ch.kind == ast::CharacterKind::SurrogatePairs; + } + false + }) + .count(), + *expected + ); } } } diff --git a/crates/oxc_regular_expression/src/body_parser/parser.rs b/crates/oxc_regular_expression/src/body_parser/parser.rs index 19ce876beb4f1..548d9920bc9c5 100644 --- a/crates/oxc_regular_expression/src/body_parser/parser.rs +++ b/crates/oxc_regular_expression/src/body_parser/parser.rs @@ -28,7 +28,7 @@ impl<'a> PatternParser<'a> { allocator, source_text, span_factory: SpanFactory::new(options.span_offset), - reader: Reader::new(source_text, options.unicode_mode), + reader: Reader::new(source_text), state: State::new(options.unicode_mode, options.unicode_sets_mode), } } @@ -316,7 +316,11 @@ impl<'a> PatternParser<'a> { return Ok(Some(ast::Term::Character(ast::Character { span: self.span_factory.create(span_start, self.reader.offset()), - kind: ast::CharacterKind::Symbol, + kind: if self.state.unicode_mode || unicode::is_bmp(cp) { + ast::CharacterKind::Symbol + } else { + ast::CharacterKind::SurrogatePairs + }, value: cp, }))); } @@ -438,7 +442,11 @@ impl<'a> PatternParser<'a> { if let Some(cp) = self.consume_extended_pattern_character() { return Ok(Some(ast::Term::Character(ast::Character { span: self.span_factory.create(span_start, self.reader.offset()), - kind: ast::CharacterKind::Symbol, + kind: if self.state.unicode_mode || unicode::is_bmp(cp) { + ast::CharacterKind::Symbol + } else { + ast::CharacterKind::SurrogatePairs + }, value: cp, }))); } @@ -926,7 +934,11 @@ impl<'a> PatternParser<'a> { return Ok(Some(ast::CharacterClassContents::Character(ast::Character { span: self.span_factory.create(span_start, self.reader.offset()), - kind: ast::CharacterKind::Symbol, + kind: if self.state.unicode_mode || unicode::is_bmp(cp) { + ast::CharacterKind::Symbol + } else { + ast::CharacterKind::SurrogatePairs + }, value: cp, }))); } diff --git a/crates/oxc_regular_expression/src/body_parser/reader.rs b/crates/oxc_regular_expression/src/body_parser/reader.rs index f066bdceaa121..7b754adbeebf4 100644 --- a/crates/oxc_regular_expression/src/body_parser/reader.rs +++ b/crates/oxc_regular_expression/src/body_parser/reader.rs @@ -1,66 +1,34 @@ pub struct Reader<'a> { - source: &'a str, - unicode_mode: bool, - /// Current index for `u8_units`(unicode mode) or `u16_units`(non-unicode mode). + source_text: &'a str, + /// Current index for `units`. index: usize, - // NOTE: Distinguish these 2 units looks cleaner, but it may not be necessary. - // - // If I understand correctly (and there are no unexpected factors), - // AST `Character[kind=Symbol]` only needs to be aware of this for surrogate pairs. - // - // Therefore, performance might be improved by: - // - using only `u8_units`, and - // - checking if each unit (char) is non-BMP, and if so, converting it into a surrogate pair and emitting 2 units. - // However, I'm not certain this approach is faster than current one using `encode_utf16()` all at once. - /// Iteration units for unicode mode. - /// Even in non-unicode mode, used for `Span` offset calculation. - u8_units: Vec<(usize, char)>, - /// Iteration units for non-unicode mode. - u16_units: Vec, - /// Last offset caches for non-unicode mode. - last_offset_indices: (usize, usize), + /// Iteration units. + /// We use `char` and its index regardless of the unicode mode or non-unicode mode. + /// However, in non-unicode mode, the parser may split the surrogate pairs, which may become the AST. + units: Vec<(usize, char)>, } impl<'a> Reader<'a> { - pub fn new(source: &'a str, unicode_mode: bool) -> Self { + pub fn new(source_text: &'a str) -> Self { // NOTE: Collecting `Vec` may not be efficient if the source is too large. // Implements lookahead cache with `VecDeque` is better...? // But when I tried once, there are no notable improvements. - let u8_units = source.char_indices().collect::>(); - let u16_units = if unicode_mode { "" } else { source }.encode_utf16().collect::>(); + let units = source_text.char_indices().collect::>(); - Self { source, unicode_mode, index: 0, u8_units, u16_units, last_offset_indices: (0, 0) } + Self { source_text, index: 0, units } } pub fn offset(&mut self) -> usize { - if self.unicode_mode { - self.u8_units.get(self.index).map_or(self.source.len(), |(idx, _)| *idx) - } else { - // NOTE: This does not return valid `Span` offset for surrogate pairs. - // In the first place, there is no such thing as string slice corresponding to them... - let (mut u16_idx, mut u8_idx) = self.last_offset_indices; - for (idx, ch) in &self.u8_units[u8_idx..] { - if self.index <= u16_idx { - self.last_offset_indices = (u16_idx, u8_idx); - return *idx; - } - - u16_idx += ch.len_utf16(); - u8_idx += 1; - } - self.source.len() - } + self.units.get(self.index).map_or(self.source_text.len(), |(idx, _)| *idx) } // NOTE: For now, `usize` is enough for the checkpoint. - // But `last_offset_indices` should be stored as well for more performance? pub fn checkpoint(&self) -> usize { self.index } pub fn rewind(&mut self, checkpoint: usize) { self.index = checkpoint; - self.last_offset_indices = (0, 0); } pub fn advance(&mut self) { @@ -69,13 +37,7 @@ impl<'a> Reader<'a> { fn peek_nth(&self, n: usize) -> Option { let nth = self.index + n; - - if self.unicode_mode { - self.u8_units.get(nth).map(|&(_, ch)| ch as u32) - } else { - #[allow(clippy::cast_lossless)] - self.u16_units.get(nth).map(|&cu| cu as u32) - } + self.units.get(nth).map(|&(_, ch)| ch as u32) } pub fn peek(&self) -> Option { @@ -139,121 +101,89 @@ mod test { #[test] fn index_basic() { let source_text = "/RegExp✨/i"; - let unicode_reader = Reader::new(source_text, true); - let legacy_reader = Reader::new(source_text, false); + let mut reader = Reader::new(source_text); - for mut reader in [unicode_reader, legacy_reader] { - assert_eq!(reader.index, 0); - assert_eq!(reader.peek(), Some('/' as u32)); + assert_eq!(reader.index, 0); + assert_eq!(reader.peek(), Some('/' as u32)); - reader.advance(); - assert_eq!(reader.index, 1); - assert_eq!(reader.peek(), Some('R' as u32)); - assert_eq!(reader.peek2(), Some('e' as u32)); - - assert!(reader.eat('R')); - assert!(!reader.eat('R')); - assert!(reader.eat('e')); - assert!(reader.eat('g')); - assert!(reader.eat('E')); - assert!(!reader.eat3('E', 'x', 'p')); - assert!(reader.eat2('x', 'p')); - - let checkpoint = reader.checkpoint(); - assert_eq!(checkpoint, 7); - assert_eq!(reader.peek(), Some('✨' as u32)); + reader.advance(); + assert_eq!(reader.index, 1); + assert_eq!(reader.peek(), Some('R' as u32)); + assert_eq!(reader.peek2(), Some('e' as u32)); - reader.advance(); - reader.advance(); - assert_eq!(reader.peek(), Some('i' as u32)); + assert!(reader.eat('R')); + assert!(!reader.eat('R')); + assert!(reader.eat('e')); + assert!(reader.eat('g')); + assert!(reader.eat('E')); + assert!(!reader.eat3('E', 'x', 'p')); + assert!(reader.eat2('x', 'p')); - reader.advance(); - assert_eq!(reader.peek(), None); + let checkpoint = reader.checkpoint(); + assert_eq!(checkpoint, 7); + assert_eq!(reader.peek(), Some('✨' as u32)); - reader.rewind(checkpoint); - assert_eq!(reader.peek(), Some('✨' as u32)); - } + reader.advance(); + reader.advance(); + assert_eq!(reader.peek(), Some('i' as u32)); + + reader.advance(); + assert_eq!(reader.peek(), None); + + reader.rewind(checkpoint); + assert_eq!(reader.peek(), Some('✨' as u32)); } #[test] fn index_unicode() { let source_text = "𠮷野家は👈🏻あっち"; + let mut reader = Reader::new(source_text); - let mut unicode_reader = Reader::new(source_text, true); - - assert!(unicode_reader.eat('𠮷')); // Can eat - assert!(unicode_reader.eat2('野', '家')); - let checkpoint = unicode_reader.checkpoint(); - assert!(unicode_reader.eat('は')); + assert!(reader.eat('𠮷')); // Can eat + assert!(reader.eat2('野', '家')); + let checkpoint = reader.checkpoint(); + assert!(reader.eat('は')); // Emoji + Skin tone - unicode_reader.advance(); - unicode_reader.advance(); - - assert!(unicode_reader.eat('あ')); - assert_eq!(unicode_reader.peek(), Some('っ' as u32)); - assert_eq!(unicode_reader.peek2(), Some('ち' as u32)); - - unicode_reader.rewind(checkpoint); - assert!(unicode_reader.eat('は')); + reader.advance(); + reader.advance(); - let mut legacy_reader = Reader::new(source_text, false); + assert!(reader.eat('あ')); + assert_eq!(reader.peek(), Some('っ' as u32)); + assert_eq!(reader.peek2(), Some('ち' as u32)); - assert!(!legacy_reader.eat('𠮷')); // Can not eat - legacy_reader.advance(); - assert!(!legacy_reader.eat('𠮷')); // Also can not - legacy_reader.advance(); - - assert!(legacy_reader.eat('野')); - assert!(legacy_reader.eat('家')); - let checkpoint = unicode_reader.checkpoint(); - assert!(legacy_reader.eat('は')); - - legacy_reader.advance(); - legacy_reader.advance(); - legacy_reader.advance(); - legacy_reader.advance(); - - assert_eq!(legacy_reader.peek(), Some('あ' as u32)); - assert_eq!(legacy_reader.peek2(), Some('っ' as u32)); - assert!(legacy_reader.eat3('あ', 'っ', 'ち')); - - legacy_reader.rewind(checkpoint); - assert!(legacy_reader.eat('は')); + reader.rewind(checkpoint); + assert!(reader.eat('は')); } #[test] fn span_position() { let source_text = "^ Catch😎 @ symbols🇺🇳 $"; + let mut reader = Reader::new(source_text); + + while reader.peek() != Some('^' as u32) { + reader.advance(); + } + let s1 = reader.offset(); + assert!(reader.eat('^')); + let e1 = reader.offset(); + + while reader.peek() != Some('@' as u32) { + reader.advance(); + } + let s2 = reader.offset(); + assert!(reader.eat('@')); + let e2 = reader.offset(); - let unicode_reader = Reader::new(source_text, true); - let legacy_reader = Reader::new(source_text, false); - - for mut reader in [unicode_reader, legacy_reader] { - while reader.peek() != Some('^' as u32) { - reader.advance(); - } - let s1 = reader.offset(); - assert!(reader.eat('^')); - let e1 = reader.offset(); - - while reader.peek() != Some('@' as u32) { - reader.advance(); - } - let s2 = reader.offset(); - assert!(reader.eat('@')); - let e2 = reader.offset(); - - while reader.peek() != Some('$' as u32) { - reader.advance(); - } - let s3 = reader.offset(); - assert!(reader.eat('$')); - let e3 = reader.offset(); - - assert_eq!(&source_text[s1..e1], "^"); - assert_eq!(&source_text[s2..e2], "@"); - assert_eq!(&source_text[s3..e3], "$"); + while reader.peek() != Some('$' as u32) { + reader.advance(); } + let s3 = reader.offset(); + assert!(reader.eat('$')); + let e3 = reader.offset(); + + assert_eq!(&source_text[s1..e1], "^"); + assert_eq!(&source_text[s2..e2], "@"); + assert_eq!(&source_text[s3..e3], "$"); } } diff --git a/crates/oxc_regular_expression/src/body_parser/state.rs b/crates/oxc_regular_expression/src/body_parser/state.rs index 109c5acc3adf5..ad73e81ebb89c 100644 --- a/crates/oxc_regular_expression/src/body_parser/state.rs +++ b/crates/oxc_regular_expression/src/body_parser/state.rs @@ -53,7 +53,7 @@ fn parse_capturing_groups(source_text: &str) -> (u32, FxHashSet<&str>, Vec<(usiz let mut capturing_group_names = FxHashSet::default(); let mut duplicated_named_capturing_groups = vec![]; - let mut reader = Reader::new(source_text, true); + let mut reader = Reader::new(source_text); let mut in_escape = false; let mut in_character_class = false; diff --git a/crates/oxc_regular_expression/src/body_parser/unicode.rs b/crates/oxc_regular_expression/src/body_parser/unicode.rs index 7a4249b4734f2..504bcd27a6650 100644 --- a/crates/oxc_regular_expression/src/body_parser/unicode.rs +++ b/crates/oxc_regular_expression/src/body_parser/unicode.rs @@ -126,6 +126,10 @@ pub fn combine_surrogate_pair(lead: u32, trail: u32) -> u32 { (lead - 0xd800) * 0x400 + trail - 0xdc00 + 0x10000 } +pub fn is_bmp(cp: u32) -> bool { + cp <= 0xffff +} + pub fn map_control_escape(cp: u32) -> Option { match char::from_u32(cp) { Some('f') => Some(0x0c), From ea42174166e2c5869a870a936190bbebb4ab8c8b Mon Sep 17 00:00:00 2001 From: Yuji Sugiura Date: Mon, 26 Aug 2024 13:19:18 +0900 Subject: [PATCH 2/3] Update comment --- crates/oxc_regular_expression/src/body_parser/reader.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/oxc_regular_expression/src/body_parser/reader.rs b/crates/oxc_regular_expression/src/body_parser/reader.rs index 7b754adbeebf4..fe20ea97cc67e 100644 --- a/crates/oxc_regular_expression/src/body_parser/reader.rs +++ b/crates/oxc_regular_expression/src/body_parser/reader.rs @@ -4,7 +4,7 @@ pub struct Reader<'a> { index: usize, /// Iteration units. /// We use `char` and its index regardless of the unicode mode or non-unicode mode. - /// However, in non-unicode mode, the parser may split the surrogate pairs, which may become the AST. + /// However, in non-unicode mode, the parser may mark it as the surrogate pairs. units: Vec<(usize, char)>, } @@ -35,6 +35,8 @@ impl<'a> Reader<'a> { self.index += 1; } + // We can iterate over `char` and parse the pattern. + // But we need a code point, not a char, for AST results. fn peek_nth(&self, n: usize) -> Option { let nth = self.index + n; self.units.get(nth).map(|&(_, ch)| ch as u32) From 6debca24623be2a0830e29402708117df6e197d5 Mon Sep 17 00:00:00 2001 From: Yuji Sugiura Date: Mon, 26 Aug 2024 13:22:57 +0900 Subject: [PATCH 3/3] Update snapshot --- tasks/coverage/parser_test262.snap | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tasks/coverage/parser_test262.snap b/tasks/coverage/parser_test262.snap index 27d28db939ed2..1b305bd00c714 100644 --- a/tasks/coverage/parser_test262.snap +++ b/tasks/coverage/parser_test262.snap @@ -20839,11 +20839,11 @@ Negative Passed: 4220/4220 (100.00%) · ─ ╰──── - × Invalid regular expression: Invalid surrogate pair - ╭─[test262/test/language/literals/regexp/named-groups/invalid-non-id-start-groupspecifier-2.js:15:5] + × Invalid regular expression: Unterminated capturing group name + ╭─[test262/test/language/literals/regexp/named-groups/invalid-non-id-start-groupspecifier-2.js:15:4] 14 │ 15 │ /(?<𐒤>a)/; - · ─ + · ─ ╰──── × Invalid regular expression: Invalid unicode escape sequence