Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions crates/oxc_regular_expression/src/ast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,14 +108,12 @@ pub struct Quantifier<'a> {
/// Single character.
///
/// Appears in the AST both as a pattern term and as a character-class member.
/// Records where the character was found (`span`), how it is classified
/// (`kind`), and its numeric value (`value`).
#[derive(Debug, Copy, Clone)]
pub struct Character {
/// Span of this character, created from the parser's start and end offsets.
/// This will be an invalid position when `UnicodeMode` is disabled and `value` is a surrogate pair.
pub span: Span,
/// Classification of the character (e.g. `Symbol`, `UnicodeEscape`); see `CharacterKind`.
pub kind: CharacterKind,
/// Unicode code point or UTF-16 code unit.
pub value: u32,
}

#[derive(Debug, Copy, Clone)]
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum CharacterKind {
ControlLetter,
HexadecimalEscape,
Expand All @@ -124,6 +122,8 @@ pub enum CharacterKind {
Octal,
SingleEscape,
Symbol,
/// Outside of `UnicodeMode`, a `Symbol` whose code point is not in the BMP is marked as `SurrogatePairs` instead.
SurrogatePairs,
UnicodeEscape,
}

Expand Down
54 changes: 47 additions & 7 deletions crates/oxc_regular_expression/src/body_parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ pub use parser::PatternParser;

#[cfg(test)]
mod test {
use crate::{ParserOptions, PatternParser};
use crate::{ast, ParserOptions, PatternParser};
use oxc_allocator::Allocator;

#[test]
Expand Down Expand Up @@ -243,17 +243,57 @@ mod test {
}

#[test]
fn should_handle_unicode() {
fn should_handle_surrogate_pairs() {
let allocator = Allocator::default();
let source_text = "このEmoji🥹の数が変わる";

let source_text = "🚀, 𐍈, 𝄞, and so on...";
for (options, expected) in &[
(ParserOptions::default(), 15),
(ParserOptions::default().with_unicode_mode(), 14),
(ParserOptions::default().with_unicode_sets_mode(), 14),
(ParserOptions::default(), 3),
(ParserOptions::default().with_unicode_mode(), 0),
(ParserOptions::default().with_unicode_sets_mode(), 0),
] {
let pattern = PatternParser::new(&allocator, source_text, *options).parse().unwrap();
assert_eq!(pattern.body.body[0].body.len(), *expected);

let alternative = &pattern.body.body[0];
assert_eq!(
alternative
.body
.iter()
.filter(|term| {
if let ast::Term::Character(ch) = term {
return ch.kind == ast::CharacterKind::SurrogatePairs;
}
false
})
.count(),
*expected
);
}

let source_text = "[🥹🚀𝄞]";
for (options, expected) in &[
(ParserOptions::default(), 3),
(ParserOptions::default().with_unicode_mode(), 0),
(ParserOptions::default().with_unicode_sets_mode(), 0),
] {
let pattern = PatternParser::new(&allocator, source_text, *options).parse().unwrap();

let ast::Term::CharacterClass(character_class) = &pattern.body.body[0].body[0] else {
panic!("Expected character class");
};
assert_eq!(
character_class
.body
.iter()
.filter(|ccc| {
if let ast::CharacterClassContents::Character(ch) = ccc {
return ch.kind == ast::CharacterKind::SurrogatePairs;
}
false
})
.count(),
*expected
);
}
}
}
20 changes: 16 additions & 4 deletions crates/oxc_regular_expression/src/body_parser/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ impl<'a> PatternParser<'a> {
allocator,
source_text,
span_factory: SpanFactory::new(options.span_offset),
reader: Reader::new(source_text, options.unicode_mode),
reader: Reader::new(source_text),
state: State::new(options.unicode_mode, options.unicode_sets_mode),
}
}
Expand Down Expand Up @@ -316,7 +316,11 @@ impl<'a> PatternParser<'a> {

return Ok(Some(ast::Term::Character(ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::Symbol,
kind: if self.state.unicode_mode || unicode::is_bmp(cp) {
ast::CharacterKind::Symbol
} else {
ast::CharacterKind::SurrogatePairs
},
value: cp,
})));
}
Expand Down Expand Up @@ -438,7 +442,11 @@ impl<'a> PatternParser<'a> {
if let Some(cp) = self.consume_extended_pattern_character() {
return Ok(Some(ast::Term::Character(ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::Symbol,
kind: if self.state.unicode_mode || unicode::is_bmp(cp) {
ast::CharacterKind::Symbol
} else {
ast::CharacterKind::SurrogatePairs
},
value: cp,
})));
}
Expand Down Expand Up @@ -926,7 +934,11 @@ impl<'a> PatternParser<'a> {

return Ok(Some(ast::CharacterClassContents::Character(ast::Character {
span: self.span_factory.create(span_start, self.reader.offset()),
kind: ast::CharacterKind::Symbol,
kind: if self.state.unicode_mode || unicode::is_bmp(cp) {
ast::CharacterKind::Symbol
} else {
ast::CharacterKind::SurrogatePairs
},
value: cp,
})));
}
Expand Down
Loading