diff --git a/crates/oxc_ast/src/ast/js.rs b/crates/oxc_ast/src/ast/js.rs
index 3048bf4bd4c35..10d0ed77da9e4 100644
--- a/crates/oxc_ast/src/ast/js.rs
+++ b/crates/oxc_ast/src/ast/js.rs
@@ -461,8 +461,17 @@ pub struct TaggedTemplateExpression<'a> {
 #[generate_derive(CloneIn, Dummy, TakeIn, GetSpan, GetSpanMut, ContentEq, ESTree)]
 pub struct TemplateElement<'a> {
     pub span: Span,
+    #[estree(via = TemplateElementValue)]
     pub value: TemplateElementValue<'a>,
     pub tail: bool,
+    /// The template element contains lone surrogates.
+    ///
+    /// `value.cooked` is encoded using `\u{FFFD}` (the lossy replacement character) as an escape character.
+    /// Lone surrogates are encoded as `\u{FFFD}XXXX`, where `XXXX` is the code unit in hex.
+    /// The lossy escape character itself is encoded as `\u{FFFD}fffd`.
+    #[builder(default)]
+    #[estree(skip)]
+    pub lone_surrogates: bool,
 }
 
 /// See [template-strings-cooked-vs-raw](https://exploringjs.com/js/book/ch_template-literals.html#template-strings-cooked-vs-raw)
diff --git a/crates/oxc_ast/src/generated/assert_layouts.rs b/crates/oxc_ast/src/generated/assert_layouts.rs
index 192583465e212..7ba5e76b6ebeb 100644
--- a/crates/oxc_ast/src/generated/assert_layouts.rs
+++ b/crates/oxc_ast/src/generated/assert_layouts.rs
@@ -105,6 +105,7 @@ const _: () = {
     assert!(offset_of!(TemplateElement, span) == 0);
     assert!(offset_of!(TemplateElement, value) == 8);
     assert!(offset_of!(TemplateElement, tail) == 40);
+    assert!(offset_of!(TemplateElement, lone_surrogates) == 41);
 
     assert!(size_of::<TemplateElementValue>() == 32);
     assert!(align_of::<TemplateElementValue>() == 8);
@@ -1496,6 +1497,7 @@ const _: () = {
     assert!(offset_of!(TemplateElement, span) == 0);
     assert!(offset_of!(TemplateElement, value) == 8);
     assert!(offset_of!(TemplateElement, tail) == 24);
+    assert!(offset_of!(TemplateElement, lone_surrogates) == 25);
 
     assert!(size_of::<TemplateElementValue>() == 16);
     assert!(align_of::<TemplateElementValue>() == 4);
diff --git a/crates/oxc_ast/src/generated/ast_builder.rs b/crates/oxc_ast/src/generated/ast_builder.rs
index
61c88f90b02dc..0a925d9982d0c 100644 --- a/crates/oxc_ast/src/generated/ast_builder.rs +++ b/crates/oxc_ast/src/generated/ast_builder.rs @@ -1900,7 +1900,7 @@ impl<'a> AstBuilder<'a> { value: TemplateElementValue<'a>, tail: bool, ) -> TemplateElement<'a> { - TemplateElement { span, value, tail } + TemplateElement { span, value, tail, lone_surrogates: Default::default() } } /// Build a [`TemplateElement`], and store it in the memory arena. @@ -1921,6 +1921,49 @@ impl<'a> AstBuilder<'a> { Box::new_in(self.template_element(span, value, tail), self.allocator) } + /// Build a [`TemplateElement`] with `lone_surrogates`. + /// + /// If you want the built node to be allocated in the memory arena, use [`AstBuilder::alloc_template_element_with_lone_surrogates`] instead. + /// + /// ## Parameters + /// * `span`: The [`Span`] covering this node + /// * `value` + /// * `tail` + /// * `lone_surrogates`: The template element contains lone surrogates. + #[inline] + pub fn template_element_with_lone_surrogates( + self, + span: Span, + value: TemplateElementValue<'a>, + tail: bool, + lone_surrogates: bool, + ) -> TemplateElement<'a> { + TemplateElement { span, value, tail, lone_surrogates } + } + + /// Build a [`TemplateElement`] with `lone_surrogates`, and store it in the memory arena. + /// + /// Returns a [`Box`] containing the newly-allocated node. If you want a stack-allocated node, use [`AstBuilder::template_element_with_lone_surrogates`] instead. + /// + /// ## Parameters + /// * `span`: The [`Span`] covering this node + /// * `value` + /// * `tail` + /// * `lone_surrogates`: The template element contains lone surrogates. 
+ #[inline] + pub fn alloc_template_element_with_lone_surrogates( + self, + span: Span, + value: TemplateElementValue<'a>, + tail: bool, + lone_surrogates: bool, + ) -> Box<'a, TemplateElement<'a>> { + Box::new_in( + self.template_element_with_lone_surrogates(span, value, tail, lone_surrogates), + self.allocator, + ) + } + /// Build a [`MemberExpression::ComputedMemberExpression`]. /// /// This node contains a [`ComputedMemberExpression`] that will be stored in the memory arena. diff --git a/crates/oxc_ast/src/generated/derive_clone_in.rs b/crates/oxc_ast/src/generated/derive_clone_in.rs index fe6b4d8a92d79..7cf0975696ed1 100644 --- a/crates/oxc_ast/src/generated/derive_clone_in.rs +++ b/crates/oxc_ast/src/generated/derive_clone_in.rs @@ -1097,6 +1097,7 @@ impl<'new_alloc> CloneIn<'new_alloc> for TemplateElement<'_> { span: CloneIn::clone_in(&self.span, allocator), value: CloneIn::clone_in(&self.value, allocator), tail: CloneIn::clone_in(&self.tail, allocator), + lone_surrogates: CloneIn::clone_in(&self.lone_surrogates, allocator), } } @@ -1105,6 +1106,7 @@ impl<'new_alloc> CloneIn<'new_alloc> for TemplateElement<'_> { span: CloneIn::clone_in_with_semantic_ids(&self.span, allocator), value: CloneIn::clone_in_with_semantic_ids(&self.value, allocator), tail: CloneIn::clone_in_with_semantic_ids(&self.tail, allocator), + lone_surrogates: CloneIn::clone_in_with_semantic_ids(&self.lone_surrogates, allocator), } } } diff --git a/crates/oxc_ast/src/generated/derive_content_eq.rs b/crates/oxc_ast/src/generated/derive_content_eq.rs index e9cb8d3c237f8..fc452e485e1f6 100644 --- a/crates/oxc_ast/src/generated/derive_content_eq.rs +++ b/crates/oxc_ast/src/generated/derive_content_eq.rs @@ -291,6 +291,7 @@ impl ContentEq for TemplateElement<'_> { fn content_eq(&self, other: &Self) -> bool { ContentEq::content_eq(&self.value, &other.value) && ContentEq::content_eq(&self.tail, &other.tail) + && ContentEq::content_eq(&self.lone_surrogates, &other.lone_surrogates) } } diff --git 
a/crates/oxc_ast/src/generated/derive_dummy.rs b/crates/oxc_ast/src/generated/derive_dummy.rs index 3c3e7960d3285..873546c5fa3e4 100644 --- a/crates/oxc_ast/src/generated/derive_dummy.rs +++ b/crates/oxc_ast/src/generated/derive_dummy.rs @@ -215,6 +215,7 @@ impl<'a> Dummy<'a> for TemplateElement<'a> { span: Dummy::dummy(allocator), value: Dummy::dummy(allocator), tail: Dummy::dummy(allocator), + lone_surrogates: Dummy::dummy(allocator), } } } diff --git a/crates/oxc_ast/src/generated/derive_estree.rs b/crates/oxc_ast/src/generated/derive_estree.rs index 4fcaefea53025..017065b1b8956 100644 --- a/crates/oxc_ast/src/generated/derive_estree.rs +++ b/crates/oxc_ast/src/generated/derive_estree.rs @@ -342,7 +342,7 @@ impl ESTree for TemplateElement<'_> { state.serialize_field("type", &JsonSafeString("TemplateElement")); state.serialize_field("start", &self.span.start); state.serialize_field("end", &self.span.end); - state.serialize_field("value", &self.value); + state.serialize_field("value", &crate::serialize::TemplateElementValue(self)); state.serialize_field("tail", &self.tail); state.end(); } diff --git a/crates/oxc_ast/src/serialize.rs b/crates/oxc_ast/src/serialize.rs index 7183491d077e1..c3891c4ce74b8 100644 --- a/crates/oxc_ast/src/serialize.rs +++ b/crates/oxc_ast/src/serialize.rs @@ -357,6 +357,52 @@ impl ESTree for RegExpFlagsConverter<'_> { } } +/// Serializer for `value` field of `TemplateElement`. +/// +/// Handle when `lone_surrogates` flag is set, indicating the cooked string contains lone surrogates. 
+#[ast_meta]
+#[estree(
+    ts_type = "TemplateElementValue",
+    raw_deser = r#"
+        let value = DESER[TemplateElementValue](POS_OFFSET.value);
+        if (value.cooked !== null && DESER[bool](POS_OFFSET.lone_surrogates)) {
+            value.cooked = value.cooked
+                .replace(/\uFFFD(.{4})/g, (_, hex) => String.fromCodePoint(parseInt(hex, 16)));
+        }
+        value
+    "#
+)]
+pub struct TemplateElementValue<'a, 'b>(pub &'b TemplateElement<'a>);
+
+impl ESTree for TemplateElementValue<'_, '_> {
+    fn serialize<S: Serializer>(&self, serializer: S) {
+        let element = self.0;
+        #[expect(clippy::if_not_else)]
+        if !element.lone_surrogates {
+            element.value.serialize(serializer);
+        } else {
+            // String contains lone surrogates
+            self.serialize_lone_surrogates(serializer);
+        }
+    }
+}
+
+impl TemplateElementValue<'_, '_> {
+    #[cold]
+    #[inline(never)]
+    fn serialize_lone_surrogates<S: Serializer>(&self, serializer: S) {
+        let value = &self.0.value;
+
+        let mut state = serializer.serialize_struct();
+        state.serialize_field("raw", &value.raw);
+
+        let cooked = value.cooked.as_ref().map(|cooked| LoneSurrogatesString(cooked.as_str()));
+        state.serialize_field("cooked", &cooked);
+
+        state.end();
+    }
+}
+
 // --------------------
 // Various
 // --------------------
diff --git a/crates/oxc_parser/src/js/expression.rs b/crates/oxc_parser/src/js/expression.rs
index e157274a129d6..f4998c9c9a778 100644
--- a/crates/oxc_parser/src/js/expression.rs
+++ b/crates/oxc_parser/src/js/expression.rs
@@ -540,6 +540,7 @@ impl<'a> ParserImpl<'a> {
         // `cooked = None` when template literal has invalid escape sequence
         // This is matched by `is_valid_escape_sequence` in `Lexer::read_template_literal`
         let cooked = self.cur_template_string();
+        let lone_surrogates = self.cur_token().lone_surrogates;
 
         let cur_src = self.cur_src();
         let raw = &cur_src[1..cur_src.len() - end_offset as usize];
@@ -560,10 +561,11 @@ impl<'a> ParserImpl<'a> {
         }
 
         let tail = matches!(cur_kind, Kind::TemplateTail | Kind::NoSubstitutionTemplate);
-        self.ast.template_element(
+        self.ast.template_element_with_lone_surrogates(
             span,
             TemplateElementValue { raw, cooked: cooked.map(Atom::from) },
             tail,
+            lone_surrogates,
         )
     }
 
diff --git a/crates/oxc_parser/src/lexer/template.rs b/crates/oxc_parser/src/lexer/template.rs
index 2c76d67ad87fe..c43f22a28e175 100644
--- a/crates/oxc_parser/src/lexer/template.rs
+++ b/crates/oxc_parser/src/lexer/template.rs
@@ -1,4 +1,4 @@
-use std::cmp::max;
+use std::{cmp::max, str};
 
 use oxc_allocator::String;
 
@@ -11,9 +11,26 @@ use super::{
 
 const MIN_ESCAPED_TEMPLATE_LIT_LEN: usize = 16;
 
+/// Convert `char` to UTF-8 bytes array.
+const fn to_bytes<const N: usize>(ch: char) -> [u8; N] {
+    let mut bytes = [0u8; N];
+    ch.encode_utf8(&mut bytes);
+    bytes
+}
+
+/// Lossy replacement character (U+FFFD) as UTF-8 bytes.
+const LOSSY_REPLACEMENT_CHAR_BYTES: [u8; 3] = to_bytes('\u{FFFD}');
+const LOSSY_REPLACEMENT_CHAR_FIRST_BYTE: u8 = LOSSY_REPLACEMENT_CHAR_BYTES[0];
+const _: () = assert!(LOSSY_REPLACEMENT_CHAR_FIRST_BYTE == 0xEF);
+
 static TEMPLATE_LITERAL_TABLE: SafeByteMatchTable =
     safe_byte_match_table!(|b| matches!(b, b'$' | b'`' | b'\r' | b'\\'));
 
+// Same as above, but with 1st byte of lossy replacement character added
+static TEMPLATE_LITERAL_ESCAPED_MATCH_TABLE: SafeByteMatchTable = safe_byte_match_table!(
+    |b| matches!(b, b'$' | b'`' | b'\r' | b'\\' | LOSSY_REPLACEMENT_CHAR_FIRST_BYTE)
+);
+
 /// 12.8.6 Template Literal Lexical Components
 impl<'a> Lexer<'a> {
     /// Read template literal component.
@@ -206,7 +223,7 @@ impl<'a> Lexer<'a> {
 
         byte_search! {
             lexer: self,
-            table: TEMPLATE_LITERAL_TABLE,
+            table: TEMPLATE_LITERAL_ESCAPED_MATCH_TABLE,
             start: pos,
             continue_if: (next_byte, pos) {
                 if next_byte == b'$' {
@@ -238,7 +255,8 @@ impl<'a> Lexer<'a> {
                         cold_branch(|| true)
                     }
                 } else {
-                    // Next byte is '`', `\r` or `\`. Add chunk up to before this char to `str`.
+                    // Next byte is '`', `\r`, `\`, or first byte of lossy replacement character.
+                    // Add chunk up to before this char to `str`.
// SAFETY: Caller guarantees `chunk_start` is not after `pos` at start of // this function. `pos` only increases during searching. // Where `chunk_start` is updated, it's always before or equal to `pos`. @@ -293,10 +311,7 @@ impl<'a> Lexer<'a> { // Continue searching true } - _ => { - // `TEMPLATE_LITERAL_TABLE` only matches `$`, '`', `\r` and `\` - debug_assert!(next_byte == b'\\'); - + b'\\' => { // Decode escape sequence into `str`. // `read_string_escape_sequence` expects `self.source` to be positioned after `\`. // SAFETY: Next byte is `\`, which is ASCII, so `pos + 1` is UTF-8 char boundary. @@ -315,6 +330,43 @@ impl<'a> Lexer<'a> { // backwards from that, so subtracting 1 again is within bounds. pos = unsafe {chunk_start.sub(1)}; + // Continue searching + true + } + _ => { + // `TEMPLATE_LITERAL_ESCAPED_MATCH_TABLE` only matches `$`, '`', `\r`, `\`, + // or first byte of lossy replacement character + debug_assert!(next_byte == LOSSY_REPLACEMENT_CHAR_FIRST_BYTE); + + // SAFETY: 0xEF is always first byte of a 3-byte UTF-8 character, + // so there must be 2 more bytes to read + let next2 = unsafe { pos.add(1).read2() }; + if next2 == [LOSSY_REPLACEMENT_CHAR_BYTES[1], LOSSY_REPLACEMENT_CHAR_BYTES[2]] + && self.token.lone_surrogates + { + str.push_str("\u{FFFD}fffd"); + } else { + let bytes = [LOSSY_REPLACEMENT_CHAR_FIRST_BYTE, next2[0], next2[1]]; + // SAFETY: 0xEF is always first byte of a 3-byte UTF-8 character, + // so these 3 bytes must comprise a valid UTF-8 string + let s = unsafe { str::from_utf8_unchecked(&bytes) }; + str.push_str(s); + } + + // Advance past this character. + // SAFETY: Character is 3 bytes, so `pos + 2` is in bounds. + // Note: `byte_search!` macro already advances `pos` by 1, so only + // advance by 2 here, so that in total we skip 3 bytes. + pos = unsafe { pos.add(2) }; + + // Set next chunk to start after this character. 
+ // SAFETY: It's a 3 byte character, and we added 2 to `pos` above, + // so `pos + 1` must be a UTF-8 char boundary. + // This temporarily puts `chunk_start` 1 byte after `pos`, but `byte_search!` macro + // increments `pos` when return `true` from `continue_if`, so `pos` will be + // brought up to `chunk_start` again. + chunk_start = unsafe { pos.add(1) }; + // Continue searching true } diff --git a/napi/parser/deserialize-js.js b/napi/parser/deserialize-js.js index 681690ec1a428..e3f8a2ccc3cf5 100644 --- a/napi/parser/deserialize-js.js +++ b/napi/parser/deserialize-js.js @@ -148,11 +148,16 @@ function deserializeTaggedTemplateExpression(pos) { } function deserializeTemplateElement(pos) { + let value = deserializeTemplateElementValue(pos + 8); + if (value.cooked !== null && deserializeBool(pos + 41)) { + value.cooked = value.cooked + .replace(/\uFFFD(.{4})/g, (_, hex) => String.fromCodePoint(parseInt(hex, 16))); + } return { type: 'TemplateElement', start: deserializeU32(pos), end: deserializeU32(pos + 4), - value: deserializeTemplateElementValue(pos + 8), + value, tail: deserializeBool(pos + 40), }; } diff --git a/napi/parser/deserialize-ts.js b/napi/parser/deserialize-ts.js index 1614ad4fe0201..5b7575dba3fd7 100644 --- a/napi/parser/deserialize-ts.js +++ b/napi/parser/deserialize-ts.js @@ -159,11 +159,16 @@ function deserializeTaggedTemplateExpression(pos) { } function deserializeTemplateElement(pos) { + let value = deserializeTemplateElementValue(pos + 8); + if (value.cooked !== null && deserializeBool(pos + 41)) { + value.cooked = value.cooked + .replace(/\uFFFD(.{4})/g, (_, hex) => String.fromCodePoint(parseInt(hex, 16))); + } return { type: 'TemplateElement', start: deserializeU32(pos), end: deserializeU32(pos + 4), - value: deserializeTemplateElementValue(pos + 8), + value, tail: deserializeBool(pos + 40), }; } diff --git a/napi/parser/test/parse-raw.test.ts b/napi/parser/test/parse-raw.test.ts index 9126ebcc69929..2b13c4282d4a7 100644 --- 
a/napi/parser/test/parse-raw.test.ts +++ b/napi/parser/test/parse-raw.test.ts @@ -97,6 +97,10 @@ describe('edge cases', () => { ';"\\uD800\\uDBFF";', ';"�\\u{FFFD}";', ';"�\\u{FFFD}\\uD800\\uDBFF�\\u{FFFD}";', + // `TemplateLiteral`s containing lone surrogates and/or lossy replacement characters + '`\\uD800\\uDBFF${x}\\uD800\\uDBFF`;', + '`�\\u{FFFD}${x}�\\u{FFFD}`;', + '`�\\u{FFFD}\\uD800${x}\\uDBFF�\\u{FFFD}`;', ])('%s', (sourceText) => { assertRawAndStandardMatch('dummy.js', sourceText); }); diff --git a/napi/parser/test/parse.test.ts b/napi/parser/test/parse.test.ts index 462b682584d24..e4c6cd8767500 100644 --- a/napi/parser/test/parse.test.ts +++ b/napi/parser/test/parse.test.ts @@ -282,6 +282,146 @@ describe('parse', () => { }); }); + describe('`TemplateLiteral`', () => { + it('lone surrogates', () => { + const ret = parseSync('test.js', '`\\uD800\\uDBFF${x}\\uD800\\uDBFF`;'); + expect(ret.errors.length).toBe(0); + expect(ret.program.body.length).toBe(1); + expect(ret.program.body[0]).toEqual({ + type: 'ExpressionStatement', + start: 0, + end: 31, + expression: { + type: 'TemplateLiteral', + start: 0, + end: 30, + expressions: [ + { + type: 'Identifier', + start: 15, + end: 16, + name: 'x', + }, + ], + quasis: [ + { + type: 'TemplateElement', + start: 1, + end: 13, + value: { + raw: '\\uD800\\uDBFF', + cooked: '\ud800\udbff', + }, + 'tail': false, + }, + { + type: 'TemplateElement', + start: 17, + end: 29, + value: { + raw: '\\uD800\\uDBFF', + cooked: '\ud800\udbff', + }, + 'tail': true, + }, + ], + }, + }); + }); + + it('lossy replacement character', () => { + const ret = parseSync('test.js', '`�\\u{FFFD}${x}�\\u{FFFD}`;'); + expect(ret.errors.length).toBe(0); + expect(ret.program.body.length).toBe(1); + expect(ret.program.body[0]).toEqual({ + type: 'ExpressionStatement', + start: 0, + end: 25, + expression: { + type: 'TemplateLiteral', + start: 0, + end: 24, + expressions: [ + { + type: 'Identifier', + start: 12, + end: 13, + name: 'x', + }, + ], + quasis: 
[ + { + type: 'TemplateElement', + start: 1, + end: 10, + value: { + raw: '�\\u{FFFD}', + cooked: '��', + }, + 'tail': false, + }, + { + type: 'TemplateElement', + start: 14, + end: 23, + value: { + raw: '�\\u{FFFD}', + cooked: '��', + }, + 'tail': true, + }, + ], + }, + }); + }); + + it('lone surrogates and lossy replacement characters', () => { + const ret = parseSync('test.js', '`�\\u{FFFD}\\uD800${x}\\uDBFF�\\u{FFFD}`;'); + expect(ret.errors.length).toBe(0); + expect(ret.program.body.length).toBe(1); + expect(ret.program.body[0]).toEqual({ + type: 'ExpressionStatement', + start: 0, + end: 37, + expression: { + type: 'TemplateLiteral', + start: 0, + end: 36, + expressions: [ + { + type: 'Identifier', + start: 18, + end: 19, + name: 'x', + }, + ], + quasis: [ + { + type: 'TemplateElement', + start: 1, + end: 16, + value: { + raw: '�\\u{FFFD}\\uD800', + cooked: '��\ud800', + }, + 'tail': false, + }, + { + type: 'TemplateElement', + start: 20, + end: 35, + value: { + raw: '\\uDBFF�\\u{FFFD}', + cooked: '\udbff��', + }, + 'tail': true, + }, + ], + }, + }); + }); + }); + describe('hashbang', () => { it('is `null` when no hashbang', () => { const ret = parseSync('test.js', 'let x;'); diff --git a/tasks/coverage/snapshots/estree_typescript.snap b/tasks/coverage/snapshots/estree_typescript.snap index 0c1cc10bcdd39..d327fb5267f91 100644 --- a/tasks/coverage/snapshots/estree_typescript.snap +++ b/tasks/coverage/snapshots/estree_typescript.snap @@ -4014,10 +4014,10 @@ Mismatch: tasks/coverage/typescript/tests/cases/conformance/es6/unicodeExtendedE Mismatch: tasks/coverage/typescript/tests/cases/conformance/es6/unicodeExtendedEscapes/unicodeExtendedEscapesInTemplates08.ts Mismatch: tasks/coverage/typescript/tests/cases/conformance/es6/unicodeExtendedEscapes/unicodeExtendedEscapesInTemplates09.ts tasks/coverage/typescript/tests/cases/conformance/es6/unicodeExtendedEscapes/unicodeExtendedEscapesInTemplates10.ts -serde_json::from_str(estree_json) error: unexpected end of hex 
escape at line 37 column 36 +serde_json::from_str(oxc_json) error: unexpected end of hex escape at line 38 column 36 tasks/coverage/typescript/tests/cases/conformance/es6/unicodeExtendedEscapes/unicodeExtendedEscapesInTemplates11.ts -serde_json::from_str(estree_json) error: lone leading surrogate in hex escape at line 37 column 35 +serde_json::from_str(oxc_json) error: lone leading surrogate in hex escape at line 38 column 35 Mismatch: tasks/coverage/typescript/tests/cases/conformance/es6/unicodeExtendedEscapes/unicodeExtendedEscapesInTemplates13.ts Mismatch: tasks/coverage/typescript/tests/cases/conformance/es6/unicodeExtendedEscapes/unicodeExtendedEscapesInTemplates15.ts