From b0dc8b50310880f8c73ea8de7a51513fc6070d84 Mon Sep 17 00:00:00 2001
From: Yuji Sugiura <y.sugiura.0316@gmail.com>
Date: Mon, 26 Aug 2024 12:56:36 +0900
Subject: [PATCH 1/3] Remove u16 reader, mark surrogate pairs instead

---
 crates/oxc_regular_expression/src/ast.rs      |   6 +-
 .../src/body_parser/mod.rs                    |  54 ++++-
 .../src/body_parser/parser.rs                 |  20 +-
 .../src/body_parser/reader.rs                 | 216 ++++++------------
 .../src/body_parser/state.rs                  |   2 +-
 .../src/body_parser/unicode.rs                |   4 +
 6 files changed, 144 insertions(+), 158 deletions(-)

diff --git a/crates/oxc_regular_expression/src/ast.rs b/crates/oxc_regular_expression/src/ast.rs
index 76669ef92d746..23f1e59a1bf84 100644
--- a/crates/oxc_regular_expression/src/ast.rs
+++ b/crates/oxc_regular_expression/src/ast.rs
@@ -108,14 +108,12 @@ pub struct Quantifier<'a> {
 /// Single character.
 #[derive(Debug, Copy, Clone)]
 pub struct Character {
-    /// This will be invalid position when `UnicodeMode` is disabled and `value` is a surrogate pair.
     pub span: Span,
     pub kind: CharacterKind,
-    /// Unicode code point or UTF-16 code unit.
     pub value: u32,
 }
 
-#[derive(Debug, Copy, Clone)]
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
 pub enum CharacterKind {
     ControlLetter,
     HexadecimalEscape,
@@ -124,6 +122,8 @@ pub enum CharacterKind {
     Octal,
     SingleEscape,
     Symbol,
+    /// In non `UnicodeMode`, some `Symbol` is marked as `SurrogatePairs`.
+    SurrogatePairs,
     UnicodeEscape,
 }
 
diff --git a/crates/oxc_regular_expression/src/body_parser/mod.rs b/crates/oxc_regular_expression/src/body_parser/mod.rs
index e6f3804e1ef66..258825b8a0d11 100644
--- a/crates/oxc_regular_expression/src/body_parser/mod.rs
+++ b/crates/oxc_regular_expression/src/body_parser/mod.rs
@@ -8,7 +8,7 @@ pub use parser::PatternParser;
 
 #[cfg(test)]
 mod test {
-    use crate::{ParserOptions, PatternParser};
+    use crate::{ast, ParserOptions, PatternParser};
     use oxc_allocator::Allocator;
 
     #[test]
@@ -243,17 +243,57 @@ mod test {
     }
 
     #[test]
-    fn should_handle_unicode() {
+    fn should_handle_surrogate_pairs() {
         let allocator = Allocator::default();
-        let source_text = "このEmoji🥹の数が変わる";
 
+        let source_text = "🚀, 𐍈, 𝄞, and so on...";
         for (options, expected) in &[
-            (ParserOptions::default(), 15),
-            (ParserOptions::default().with_unicode_mode(), 14),
-            (ParserOptions::default().with_unicode_sets_mode(), 14),
+            (ParserOptions::default(), 3),
+            (ParserOptions::default().with_unicode_mode(), 0),
+            (ParserOptions::default().with_unicode_sets_mode(), 0),
         ] {
             let pattern = PatternParser::new(&allocator, source_text, *options).parse().unwrap();
-            assert_eq!(pattern.body.body[0].body.len(), *expected);
+
+            let alternative = &pattern.body.body[0];
+            assert_eq!(
+                alternative
+                    .body
+                    .iter()
+                    .filter(|term| {
+                        if let ast::Term::Character(ch) = term {
+                            return ch.kind == ast::CharacterKind::SurrogatePairs;
+                        }
+                        false
+                    })
+                    .count(),
+                *expected
+            );
+        }
+
+        let source_text = "[🥹🚀𝄞]";
+        for (options, expected) in &[
+            (ParserOptions::default(), 3),
+            (ParserOptions::default().with_unicode_mode(), 0),
+            (ParserOptions::default().with_unicode_sets_mode(), 0),
+        ] {
+            let pattern = PatternParser::new(&allocator, source_text, *options).parse().unwrap();
+
+            let ast::Term::CharacterClass(character_class) = &pattern.body.body[0].body[0] else {
+                panic!("Expected character class");
+            };
+            assert_eq!(
+                character_class
+                    .body
+                    .iter()
+                    .filter(|ccc| {
+                        if let ast::CharacterClassContents::Character(ch) = ccc {
+                            return ch.kind == ast::CharacterKind::SurrogatePairs;
+                        }
+                        false
+                    })
+                    .count(),
+                *expected
+            );
         }
     }
 }
diff --git a/crates/oxc_regular_expression/src/body_parser/parser.rs b/crates/oxc_regular_expression/src/body_parser/parser.rs
index 19ce876beb4f1..548d9920bc9c5 100644
--- a/crates/oxc_regular_expression/src/body_parser/parser.rs
+++ b/crates/oxc_regular_expression/src/body_parser/parser.rs
@@ -28,7 +28,7 @@ impl<'a> PatternParser<'a> {
             allocator,
             source_text,
             span_factory: SpanFactory::new(options.span_offset),
-            reader: Reader::new(source_text, options.unicode_mode),
+            reader: Reader::new(source_text),
             state: State::new(options.unicode_mode, options.unicode_sets_mode),
         }
     }
@@ -316,7 +316,11 @@ impl<'a> PatternParser<'a> {
 
             return Ok(Some(ast::Term::Character(ast::Character {
                 span: self.span_factory.create(span_start, self.reader.offset()),
-                kind: ast::CharacterKind::Symbol,
+                kind: if self.state.unicode_mode || unicode::is_bmp(cp) {
+                    ast::CharacterKind::Symbol
+                } else {
+                    ast::CharacterKind::SurrogatePairs
+                },
                 value: cp,
             })));
         }
@@ -438,7 +442,11 @@ impl<'a> PatternParser<'a> {
         if let Some(cp) = self.consume_extended_pattern_character() {
             return Ok(Some(ast::Term::Character(ast::Character {
                 span: self.span_factory.create(span_start, self.reader.offset()),
-                kind: ast::CharacterKind::Symbol,
+                kind: if self.state.unicode_mode || unicode::is_bmp(cp) {
+                    ast::CharacterKind::Symbol
+                } else {
+                    ast::CharacterKind::SurrogatePairs
+                },
                 value: cp,
             })));
         }
@@ -926,7 +934,11 @@ impl<'a> PatternParser<'a> {
 
             return Ok(Some(ast::CharacterClassContents::Character(ast::Character {
                 span: self.span_factory.create(span_start, self.reader.offset()),
-                kind: ast::CharacterKind::Symbol,
+                kind: if self.state.unicode_mode || unicode::is_bmp(cp) {
+                    ast::CharacterKind::Symbol
+                } else {
+                    ast::CharacterKind::SurrogatePairs
+                },
                 value: cp,
             })));
         }
diff --git a/crates/oxc_regular_expression/src/body_parser/reader.rs b/crates/oxc_regular_expression/src/body_parser/reader.rs
index f066bdceaa121..7b754adbeebf4 100644
--- a/crates/oxc_regular_expression/src/body_parser/reader.rs
+++ b/crates/oxc_regular_expression/src/body_parser/reader.rs
@@ -1,66 +1,34 @@
 pub struct Reader<'a> {
-    source: &'a str,
-    unicode_mode: bool,
-    /// Current index for `u8_units`(unicode mode) or `u16_units`(non-unicode mode).
+    source_text: &'a str,
+    /// Current index for `units`.
     index: usize,
-    // NOTE: Distinguish these 2 units looks cleaner, but it may not be necessary.
-    //
-    // If I understand correctly (and there are no unexpected factors),
-    // AST `Character[kind=Symbol]` only needs to be aware of this for surrogate pairs.
-    //
-    // Therefore, performance might be improved by:
-    // - using only `u8_units`, and
-    // - checking if each unit (char) is non-BMP, and if so, converting it into a surrogate pair and emitting 2 units.
-    // However, I'm not certain this approach is faster than current one using `encode_utf16()` all at once.
-    /// Iteration units for unicode mode.
-    /// Even in non-unicode mode, used for `Span` offset calculation.
-    u8_units: Vec<(usize, char)>,
-    /// Iteration units for non-unicode mode.
-    u16_units: Vec<u16>,
-    /// Last offset caches for non-unicode mode.
-    last_offset_indices: (usize, usize),
+    /// Iteration units.
+    /// We use `char` and its index regardless of the unicode mode or non-unicode mode.
+    /// However, in non-unicode mode, the parser may split the surrogate pairs, which may become the AST.
+    units: Vec<(usize, char)>,
 }
 
 impl<'a> Reader<'a> {
-    pub fn new(source: &'a str, unicode_mode: bool) -> Self {
+    pub fn new(source_text: &'a str) -> Self {
         // NOTE: Collecting `Vec` may not be efficient if the source is too large.
         // Implements lookahead cache with `VecDeque` is better...?
         // But when I tried once, there are no notable improvements.
-        let u8_units = source.char_indices().collect::<Vec<_>>();
-        let u16_units = if unicode_mode { "" } else { source }.encode_utf16().collect::<Vec<_>>();
+        let units = source_text.char_indices().collect::<Vec<_>>();
 
-        Self { source, unicode_mode, index: 0, u8_units, u16_units, last_offset_indices: (0, 0) }
+        Self { source_text, index: 0, units }
     }
 
     pub fn offset(&mut self) -> usize {
-        if self.unicode_mode {
-            self.u8_units.get(self.index).map_or(self.source.len(), |(idx, _)| *idx)
-        } else {
-            // NOTE: This does not return valid `Span` offset for surrogate pairs.
-            // In the first place, there is no such thing as string slice corresponding to them...
-            let (mut u16_idx, mut u8_idx) = self.last_offset_indices;
-            for (idx, ch) in &self.u8_units[u8_idx..] {
-                if self.index <= u16_idx {
-                    self.last_offset_indices = (u16_idx, u8_idx);
-                    return *idx;
-                }
-
-                u16_idx += ch.len_utf16();
-                u8_idx += 1;
-            }
-            self.source.len()
-        }
+        self.units.get(self.index).map_or(self.source_text.len(), |(idx, _)| *idx)
     }
 
     // NOTE: For now, `usize` is enough for the checkpoint.
-    // But `last_offset_indices` should be stored as well for more performance?
     pub fn checkpoint(&self) -> usize {
         self.index
     }
 
     pub fn rewind(&mut self, checkpoint: usize) {
         self.index = checkpoint;
-        self.last_offset_indices = (0, 0);
     }
 
     pub fn advance(&mut self) {
@@ -69,13 +37,7 @@ impl<'a> Reader<'a> {
 
     fn peek_nth(&self, n: usize) -> Option<u32> {
         let nth = self.index + n;
-
-        if self.unicode_mode {
-            self.u8_units.get(nth).map(|&(_, ch)| ch as u32)
-        } else {
-            #[allow(clippy::cast_lossless)]
-            self.u16_units.get(nth).map(|&cu| cu as u32)
-        }
+        self.units.get(nth).map(|&(_, ch)| ch as u32)
     }
 
     pub fn peek(&self) -> Option<u32> {
@@ -139,121 +101,89 @@ mod test {
     #[test]
     fn index_basic() {
         let source_text = "/RegExp✨/i";
-        let unicode_reader = Reader::new(source_text, true);
-        let legacy_reader = Reader::new(source_text, false);
+        let mut reader = Reader::new(source_text);
 
-        for mut reader in [unicode_reader, legacy_reader] {
-            assert_eq!(reader.index, 0);
-            assert_eq!(reader.peek(), Some('/' as u32));
+        assert_eq!(reader.index, 0);
+        assert_eq!(reader.peek(), Some('/' as u32));
 
-            reader.advance();
-            assert_eq!(reader.index, 1);
-            assert_eq!(reader.peek(), Some('R' as u32));
-            assert_eq!(reader.peek2(), Some('e' as u32));
-
-            assert!(reader.eat('R'));
-            assert!(!reader.eat('R'));
-            assert!(reader.eat('e'));
-            assert!(reader.eat('g'));
-            assert!(reader.eat('E'));
-            assert!(!reader.eat3('E', 'x', 'p'));
-            assert!(reader.eat2('x', 'p'));
-
-            let checkpoint = reader.checkpoint();
-            assert_eq!(checkpoint, 7);
-            assert_eq!(reader.peek(), Some('✨' as u32));
+        reader.advance();
+        assert_eq!(reader.index, 1);
+        assert_eq!(reader.peek(), Some('R' as u32));
+        assert_eq!(reader.peek2(), Some('e' as u32));
 
-            reader.advance();
-            reader.advance();
-            assert_eq!(reader.peek(), Some('i' as u32));
+        assert!(reader.eat('R'));
+        assert!(!reader.eat('R'));
+        assert!(reader.eat('e'));
+        assert!(reader.eat('g'));
+        assert!(reader.eat('E'));
+        assert!(!reader.eat3('E', 'x', 'p'));
+        assert!(reader.eat2('x', 'p'));
 
-            reader.advance();
-            assert_eq!(reader.peek(), None);
+        let checkpoint = reader.checkpoint();
+        assert_eq!(checkpoint, 7);
+        assert_eq!(reader.peek(), Some('✨' as u32));
 
-            reader.rewind(checkpoint);
-            assert_eq!(reader.peek(), Some('✨' as u32));
-        }
+        reader.advance();
+        reader.advance();
+        assert_eq!(reader.peek(), Some('i' as u32));
+
+        reader.advance();
+        assert_eq!(reader.peek(), None);
+
+        reader.rewind(checkpoint);
+        assert_eq!(reader.peek(), Some('✨' as u32));
     }
 
     #[test]
     fn index_unicode() {
         let source_text = "𠮷野家は👈🏻あっち";
+        let mut reader = Reader::new(source_text);
 
-        let mut unicode_reader = Reader::new(source_text, true);
-
-        assert!(unicode_reader.eat('𠮷')); // Can eat
-        assert!(unicode_reader.eat2('野', '家'));
-        let checkpoint = unicode_reader.checkpoint();
-        assert!(unicode_reader.eat('は'));
+        assert!(reader.eat('𠮷')); // Can eat
+        assert!(reader.eat2('野', '家'));
+        let checkpoint = reader.checkpoint();
+        assert!(reader.eat('は'));
 
         // Emoji + Skin tone
-        unicode_reader.advance();
-        unicode_reader.advance();
-
-        assert!(unicode_reader.eat('あ'));
-        assert_eq!(unicode_reader.peek(), Some('っ' as u32));
-        assert_eq!(unicode_reader.peek2(), Some('ち' as u32));
-
-        unicode_reader.rewind(checkpoint);
-        assert!(unicode_reader.eat('は'));
+        reader.advance();
+        reader.advance();
 
-        let mut legacy_reader = Reader::new(source_text, false);
+        assert!(reader.eat('あ'));
+        assert_eq!(reader.peek(), Some('っ' as u32));
+        assert_eq!(reader.peek2(), Some('ち' as u32));
 
-        assert!(!legacy_reader.eat('𠮷')); // Can not eat
-        legacy_reader.advance();
-        assert!(!legacy_reader.eat('𠮷')); // Also can not
-        legacy_reader.advance();
-
-        assert!(legacy_reader.eat('野'));
-        assert!(legacy_reader.eat('家'));
-        let checkpoint = unicode_reader.checkpoint();
-        assert!(legacy_reader.eat('は'));
-
-        legacy_reader.advance();
-        legacy_reader.advance();
-        legacy_reader.advance();
-        legacy_reader.advance();
-
-        assert_eq!(legacy_reader.peek(), Some('あ' as u32));
-        assert_eq!(legacy_reader.peek2(), Some('っ' as u32));
-        assert!(legacy_reader.eat3('あ', 'っ', 'ち'));
-
-        legacy_reader.rewind(checkpoint);
-        assert!(legacy_reader.eat('は'));
+        reader.rewind(checkpoint);
+        assert!(reader.eat('は'));
     }
 
     #[test]
     fn span_position() {
         let source_text = "^ Catch😎 @ symbols🇺🇳 $";
+        let mut reader = Reader::new(source_text);
+
+        while reader.peek() != Some('^' as u32) {
+            reader.advance();
+        }
+        let s1 = reader.offset();
+        assert!(reader.eat('^'));
+        let e1 = reader.offset();
+
+        while reader.peek() != Some('@' as u32) {
+            reader.advance();
+        }
+        let s2 = reader.offset();
+        assert!(reader.eat('@'));
+        let e2 = reader.offset();
 
-        let unicode_reader = Reader::new(source_text, true);
-        let legacy_reader = Reader::new(source_text, false);
-
-        for mut reader in [unicode_reader, legacy_reader] {
-            while reader.peek() != Some('^' as u32) {
-                reader.advance();
-            }
-            let s1 = reader.offset();
-            assert!(reader.eat('^'));
-            let e1 = reader.offset();
-
-            while reader.peek() != Some('@' as u32) {
-                reader.advance();
-            }
-            let s2 = reader.offset();
-            assert!(reader.eat('@'));
-            let e2 = reader.offset();
-
-            while reader.peek() != Some('$' as u32) {
-                reader.advance();
-            }
-            let s3 = reader.offset();
-            assert!(reader.eat('$'));
-            let e3 = reader.offset();
-
-            assert_eq!(&source_text[s1..e1], "^");
-            assert_eq!(&source_text[s2..e2], "@");
-            assert_eq!(&source_text[s3..e3], "$");
+        while reader.peek() != Some('$' as u32) {
+            reader.advance();
         }
+        let s3 = reader.offset();
+        assert!(reader.eat('$'));
+        let e3 = reader.offset();
+
+        assert_eq!(&source_text[s1..e1], "^");
+        assert_eq!(&source_text[s2..e2], "@");
+        assert_eq!(&source_text[s3..e3], "$");
     }
 }
diff --git a/crates/oxc_regular_expression/src/body_parser/state.rs b/crates/oxc_regular_expression/src/body_parser/state.rs
index 109c5acc3adf5..ad73e81ebb89c 100644
--- a/crates/oxc_regular_expression/src/body_parser/state.rs
+++ b/crates/oxc_regular_expression/src/body_parser/state.rs
@@ -53,7 +53,7 @@ fn parse_capturing_groups(source_text: &str) -> (u32, FxHashSet<&str>, Vec<(usiz
     let mut capturing_group_names = FxHashSet::default();
     let mut duplicated_named_capturing_groups = vec![];
 
-    let mut reader = Reader::new(source_text, true);
+    let mut reader = Reader::new(source_text);
 
     let mut in_escape = false;
     let mut in_character_class = false;
diff --git a/crates/oxc_regular_expression/src/body_parser/unicode.rs b/crates/oxc_regular_expression/src/body_parser/unicode.rs
index 7a4249b4734f2..504bcd27a6650 100644
--- a/crates/oxc_regular_expression/src/body_parser/unicode.rs
+++ b/crates/oxc_regular_expression/src/body_parser/unicode.rs
@@ -126,6 +126,10 @@ pub fn combine_surrogate_pair(lead: u32, trail: u32) -> u32 {
     (lead - 0xd800) * 0x400 + trail - 0xdc00 + 0x10000
 }
 
+pub fn is_bmp(cp: u32) -> bool {
+    cp <= 0xffff
+}
+
 pub fn map_control_escape(cp: u32) -> Option<u32> {
     match char::from_u32(cp) {
         Some('f') => Some(0x0c),

From ea42174166e2c5869a870a936190bbebb4ab8c8b Mon Sep 17 00:00:00 2001
From: Yuji Sugiura <y.sugiura.0316@gmail.com>
Date: Mon, 26 Aug 2024 13:19:18 +0900
Subject: [PATCH 2/3] Update comment

---
 crates/oxc_regular_expression/src/body_parser/reader.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/crates/oxc_regular_expression/src/body_parser/reader.rs b/crates/oxc_regular_expression/src/body_parser/reader.rs
index 7b754adbeebf4..fe20ea97cc67e 100644
--- a/crates/oxc_regular_expression/src/body_parser/reader.rs
+++ b/crates/oxc_regular_expression/src/body_parser/reader.rs
@@ -4,7 +4,7 @@ pub struct Reader<'a> {
     index: usize,
     /// Iteration units.
     /// We use `char` and its index regardless of the unicode mode or non-unicode mode.
-    /// However, in non-unicode mode, the parser may split the surrogate pairs, which may become the AST.
+    /// However, in non-unicode mode, the parser may mark it as the surrogate pairs.
     units: Vec<(usize, char)>,
 }
 
@@ -35,6 +35,8 @@ impl<'a> Reader<'a> {
         self.index += 1;
     }
 
+    // We can iterate over `char` and parse the pattern.
+    // But we need a code point, not a char, for AST results.
     fn peek_nth(&self, n: usize) -> Option<u32> {
         let nth = self.index + n;
         self.units.get(nth).map(|&(_, ch)| ch as u32)

From 6debca24623be2a0830e29402708117df6e197d5 Mon Sep 17 00:00:00 2001
From: Yuji Sugiura <y.sugiura.0316@gmail.com>
Date: Mon, 26 Aug 2024 13:22:57 +0900
Subject: [PATCH 3/3] Update snapshot

---
 tasks/coverage/parser_test262.snap | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tasks/coverage/parser_test262.snap b/tasks/coverage/parser_test262.snap
index 27d28db939ed2..1b305bd00c714 100644
--- a/tasks/coverage/parser_test262.snap
+++ b/tasks/coverage/parser_test262.snap
@@ -20839,11 +20839,11 @@ Negative Passed: 4220/4220 (100.00%)
     ·    ─
     ╰────
 
-  × Invalid regular expression: Invalid surrogate pair
-    ╭─[test262/test/language/literals/regexp/named-groups/invalid-non-id-start-groupspecifier-2.js:15:5]
+  × Invalid regular expression: Unterminated capturing group name
+    ╭─[test262/test/language/literals/regexp/named-groups/invalid-non-id-start-groupspecifier-2.js:15:4]
  14 │ 
  15 │ /(?<𐒤>a)/;
-    ·     ─
+    ·    ─
     ╰────
 
   × Invalid regular expression: Invalid unicode escape sequence