oxc-project · graphite-app · Apr 2, 2025 · Apr 2, 2025
diff --git a/crates/oxc_ast/src/ast/js.rs b/crates/oxc_ast/src/ast/js.rs
@@ -461,8 +461,17 @@ pub struct TaggedTemplateExpression<'a> {
 #[generate_derive(CloneIn, Dummy, TakeIn, GetSpan, GetSpanMut, ContentEq, ESTree)]
 pub struct TemplateElement<'a> {
     pub span: Span,
+    #[estree(via = TemplateElementValue)]
     pub value: TemplateElementValue<'a>,
     pub tail: bool,
+    /// The template element contains lone surrogates.
+    ///
+    /// `value.cooked` is encoded using `\u{FFFD}` (the lossy replacement character) as an escape character.
+    /// Lone surrogates are encoded as `\u{FFFD}XXXX`, where `XXXX` is the code unit in hex.
+    /// The lossy escape character itself is encoded as `\u{FFFD}fffd`.
+    #[builder(default)]
+    #[estree(skip)]
+    pub lone_surrogates: bool,
 }
 
 /// See [template-strings-cooked-vs-raw](https://exploringjs.com/js/book/ch_template-literals.html#template-strings-cooked-vs-raw)

diff --git a/crates/oxc_ast/src/generated/assert_layouts.rs b/crates/oxc_ast/src/generated/assert_layouts.rs
@@ -105,6 +105,7 @@ const _: () = {
     assert!(offset_of!(TemplateElement, span) == 0);
     assert!(offset_of!(TemplateElement, value) == 8);
     assert!(offset_of!(TemplateElement, tail) == 40);
+    assert!(offset_of!(TemplateElement, lone_surrogates) == 41);
 
     assert!(size_of::<TemplateElementValue>() == 32);
     assert!(align_of::<TemplateElementValue>() == 8);
@@ -1496,6 +1497,7 @@ const _: () = {
     assert!(offset_of!(TemplateElement, span) == 0);
     assert!(offset_of!(TemplateElement, value) == 8);
     assert!(offset_of!(TemplateElement, tail) == 24);
+    assert!(offset_of!(TemplateElement, lone_surrogates) == 25);
 
     assert!(size_of::<TemplateElementValue>() == 16);
     assert!(align_of::<TemplateElementValue>() == 4);

diff --git a/crates/oxc_ast/src/generated/ast_builder.rs b/crates/oxc_ast/src/generated/ast_builder.rs
@@ -1900,7 +1900,7 @@ impl<'a> AstBuilder<'a> {
         value: TemplateElementValue<'a>,
         tail: bool,
     ) -> TemplateElement<'a> {
-        TemplateElement { span, value, tail }
+        TemplateElement { span, value, tail, lone_surrogates: Default::default() }
     }
 
     /// Build a [`TemplateElement`], and store it in the memory arena.
@@ -1921,6 +1921,49 @@ impl<'a> AstBuilder<'a> {
         Box::new_in(self.template_element(span, value, tail), self.allocator)
     }
 
+    /// Build a [`TemplateElement`] with `lone_surrogates`.
+    ///
+    /// If you want the built node to be allocated in the memory arena, use [`AstBuilder::alloc_template_element_with_lone_surrogates`] instead.
+    ///
+    /// ## Parameters
+    /// * `span`: The [`Span`] covering this node
+    /// * `value`
+    /// * `tail`
+    /// * `lone_surrogates`: The template element contains lone surrogates.
+    #[inline]
+    pub fn template_element_with_lone_surrogates(
+        self,
+        span: Span,
+        value: TemplateElementValue<'a>,
+        tail: bool,
+        lone_surrogates: bool,
+    ) -> TemplateElement<'a> {
+        TemplateElement { span, value, tail, lone_surrogates }
+    }
+
+    /// Build a [`TemplateElement`] with `lone_surrogates`, and store it in the memory arena.
+    ///
+    /// Returns a [`Box`] containing the newly-allocated node. If you want a stack-allocated node, use [`AstBuilder::template_element_with_lone_surrogates`] instead.
+    ///
+    /// ## Parameters
+    /// * `span`: The [`Span`] covering this node
+    /// * `value`
+    /// * `tail`
+    /// * `lone_surrogates`: The template element contains lone surrogates.
+    #[inline]
+    pub fn alloc_template_element_with_lone_surrogates(
+        self,
+        span: Span,
+        value: TemplateElementValue<'a>,
+        tail: bool,
+        lone_surrogates: bool,
+    ) -> Box<'a, TemplateElement<'a>> {
+        Box::new_in(
+            self.template_element_with_lone_surrogates(span, value, tail, lone_surrogates),
+            self.allocator,
+        )
+    }
+
     /// Build a [`MemberExpression::ComputedMemberExpression`].
     ///
     /// This node contains a [`ComputedMemberExpression`] that will be stored in the memory arena.

diff --git a/crates/oxc_ast/src/generated/derive_clone_in.rs b/crates/oxc_ast/src/generated/derive_clone_in.rs
@@ -1097,6 +1097,7 @@ impl<'new_alloc> CloneIn<'new_alloc> for TemplateElement<'_> {
             span: CloneIn::clone_in(&self.span, allocator),
             value: CloneIn::clone_in(&self.value, allocator),
             tail: CloneIn::clone_in(&self.tail, allocator),
+            lone_surrogates: CloneIn::clone_in(&self.lone_surrogates, allocator),
         }
     }
 
@@ -1105,6 +1106,7 @@ impl<'new_alloc> CloneIn<'new_alloc> for TemplateElement<'_> {
             span: CloneIn::clone_in_with_semantic_ids(&self.span, allocator),
             value: CloneIn::clone_in_with_semantic_ids(&self.value, allocator),
             tail: CloneIn::clone_in_with_semantic_ids(&self.tail, allocator),
+            lone_surrogates: CloneIn::clone_in_with_semantic_ids(&self.lone_surrogates, allocator),
         }
     }
 }

diff --git a/crates/oxc_ast/src/generated/derive_content_eq.rs b/crates/oxc_ast/src/generated/derive_content_eq.rs
@@ -291,6 +291,7 @@ impl ContentEq for TemplateElement<'_> {
     fn content_eq(&self, other: &Self) -> bool {
         ContentEq::content_eq(&self.value, &other.value)
             && ContentEq::content_eq(&self.tail, &other.tail)
+            && ContentEq::content_eq(&self.lone_surrogates, &other.lone_surrogates)
     }
 }
 

diff --git a/crates/oxc_ast/src/generated/derive_dummy.rs b/crates/oxc_ast/src/generated/derive_dummy.rs
@@ -215,6 +215,7 @@ impl<'a> Dummy<'a> for TemplateElement<'a> {
             span: Dummy::dummy(allocator),
             value: Dummy::dummy(allocator),
             tail: Dummy::dummy(allocator),
+            lone_surrogates: Dummy::dummy(allocator),
         }
     }
 }

diff --git a/crates/oxc_ast/src/generated/derive_estree.rs b/crates/oxc_ast/src/generated/derive_estree.rs
@@ -342,7 +342,7 @@ impl ESTree for TemplateElement<'_> {
         state.serialize_field("type", &JsonSafeString("TemplateElement"));
         state.serialize_field("start", &self.span.start);
         state.serialize_field("end", &self.span.end);
-        state.serialize_field("value", &self.value);
+        state.serialize_field("value", &crate::serialize::TemplateElementValue(self));
         state.serialize_field("tail", &self.tail);
         state.end();
     }

diff --git a/crates/oxc_ast/src/serialize.rs b/crates/oxc_ast/src/serialize.rs
@@ -357,6 +357,52 @@ impl ESTree for RegExpFlagsConverter<'_> {
     }
 }
 
+/// Serializer for the `value` field of `TemplateElement`.
+///
+/// Wraps the whole element so the `lone_surrogates` flag is available when serializing `cooked`.
+#[ast_meta]
+#[estree(
+    ts_type = "TemplateElementValue",
+    raw_deser = r#"
+        let value = DESER[TemplateElementValue](POS_OFFSET.value);
+        if (value.cooked !== null && DESER[bool](POS_OFFSET.lone_surrogates)) {
+            value.cooked = value.cooked
+                .replace(/\uFFFD(.{4})/g, (_, hex) => String.fromCodePoint(parseInt(hex, 16)));
+        }
+        value
+    "#
+)]
+pub struct TemplateElementValue<'a, 'b>(pub &'b TemplateElement<'a>);
+
+impl ESTree for TemplateElementValue<'_, '_> {
+    fn serialize<S: Serializer>(&self, serializer: S) {
+        let element = self.0;
+        #[expect(clippy::if_not_else)]
+        if !element.lone_surrogates {
+            element.value.serialize(serializer);
+        } else {
+            // `lone_surrogates` is set: take the `#[cold]` out-of-line path that wraps `cooked`
+            self.serialize_lone_surrogates(serializer);
+        }
+    }
+}
+
+impl TemplateElementValue<'_, '_> {
+    #[cold]
+    #[inline(never)]
+    fn serialize_lone_surrogates<S: Serializer>(&self, serializer: S) {
+        let value = &self.0.value;
+        // Emit the `raw` and `cooked` fields by hand so that `cooked` can be wrapped.
+        let mut state = serializer.serialize_struct();
+        state.serialize_field("raw", &value.raw);
+        // `cooked` is optional; wrap it in `LoneSurrogatesString` only when present.
+        let cooked = value.cooked.as_ref().map(|cooked| LoneSurrogatesString(cooked.as_str()));
+        state.serialize_field("cooked", &cooked);
+
+        state.end();
+    }
+}
+
 // --------------------
 // Various
 // --------------------

diff --git a/crates/oxc_parser/src/js/expression.rs b/crates/oxc_parser/src/js/expression.rs
@@ -540,6 +540,7 @@ impl<'a> ParserImpl<'a> {
         // `cooked = None` when template literal has invalid escape sequence
         // This is matched by `is_valid_escape_sequence` in `Lexer::read_template_literal`
         let cooked = self.cur_template_string();
+        let lone_surrogates = self.cur_token().lone_surrogates;
 
         let cur_src = self.cur_src();
         let raw = &cur_src[1..cur_src.len() - end_offset as usize];
@@ -560,10 +561,11 @@ impl<'a> ParserImpl<'a> {
         }
 
         let tail = matches!(cur_kind, Kind::TemplateTail | Kind::NoSubstitutionTemplate);
-        self.ast.template_element(
+        self.ast.template_element_with_lone_surrogates(
             span,
             TemplateElementValue { raw, cooked: cooked.map(Atom::from) },
             tail,
+            lone_surrogates,
         )
     }
 

diff --git a/crates/oxc_parser/src/lexer/template.rs b/crates/oxc_parser/src/lexer/template.rs
@@ -1,4 +1,4 @@
-use std::cmp::max;
+use std::{cmp::max, str};
 
 use oxc_allocator::String;
 
@@ -11,9 +11,26 @@ use super::{
 
 const MIN_ESCAPED_TEMPLATE_LIT_LEN: usize = 16;
 
+/// Encode `ch` as UTF-8 into a `[u8; N]`. Bytes beyond the encoded length are left as `0`.
+const fn to_bytes<const N: usize>(ch: char) -> [u8; N] {
+    let mut bytes = [0u8; N];
+    ch.encode_utf8(&mut bytes);
+    bytes
+}
+
+/// Lossy replacement character (U+FFFD) as UTF-8 bytes.
+const LOSSY_REPLACEMENT_CHAR_BYTES: [u8; 3] = to_bytes('\u{FFFD}');
+const LOSSY_REPLACEMENT_CHAR_FIRST_BYTE: u8 = LOSSY_REPLACEMENT_CHAR_BYTES[0];
+const _: () = assert!(LOSSY_REPLACEMENT_CHAR_FIRST_BYTE == 0xEF);
+
 static TEMPLATE_LITERAL_TABLE: SafeByteMatchTable =
     safe_byte_match_table!(|b| matches!(b, b'$' | b'`' | b'\r' | b'\\'));
 
+// Same as above, but with 1st byte of lossy replacement character added
+static TEMPLATE_LITERAL_ESCAPED_MATCH_TABLE: SafeByteMatchTable = safe_byte_match_table!(
+    |b| matches!(b, b'$' | b'`' | b'\r' | b'\\' | LOSSY_REPLACEMENT_CHAR_FIRST_BYTE)
+);
+
 /// 12.8.6 Template Literal Lexical Components
 impl<'a> Lexer<'a> {
     /// Read template literal component.
@@ -206,7 +223,7 @@ impl<'a> Lexer<'a> {
 
         byte_search! {
             lexer: self,
-            table: TEMPLATE_LITERAL_TABLE,
+            table: TEMPLATE_LITERAL_ESCAPED_MATCH_TABLE,
             start: pos,
             continue_if: (next_byte, pos) {
                 if next_byte == b'$' {
@@ -238,7 +255,8 @@ impl<'a> Lexer<'a> {
                         cold_branch(|| true)
                     }
                 } else {
-                    // Next byte is '`', `\r` or `\`. Add chunk up to before this char to `str`.
+                    // Next byte is '`', `\r`, `\`, or first byte of lossy replacement character.
+                    // Add chunk up to before this char to `str`.
                     // SAFETY: Caller guarantees `chunk_start` is not after `pos` at start of
                     // this function. `pos` only increases during searching.
                     // Where `chunk_start` is updated, it's always before or equal to `pos`.
@@ -293,10 +311,7 @@ impl<'a> Lexer<'a> {
                             // Continue searching
                             true
                         }
-                        _ => {
-                            // `TEMPLATE_LITERAL_TABLE` only matches `$`, '`', `\r` and `\`
-                            debug_assert!(next_byte == b'\\');
-
+                        b'\\' => {
                             // Decode escape sequence into `str`.
                             // `read_string_escape_sequence` expects `self.source` to be positioned after `\`.
                             // SAFETY: Next byte is `\`, which is ASCII, so `pos + 1` is UTF-8 char boundary.
@@ -315,6 +330,43 @@ impl<'a> Lexer<'a> {
                             // backwards from that, so subtracting 1 again is within bounds.
                             pos = unsafe {chunk_start.sub(1)};
 
+                            // Continue searching
+                            true
+                        }
+                        _ => {
+                            // `TEMPLATE_LITERAL_ESCAPED_MATCH_TABLE` only matches `$`, '`', `\r`, `\`,
+                            // or first byte of lossy replacement character
+                            debug_assert!(next_byte == LOSSY_REPLACEMENT_CHAR_FIRST_BYTE);
+
+                            // SAFETY: 0xEF is always first byte of a 3-byte UTF-8 character,
+                            // so there must be 2 more bytes to read
+                            let next2 = unsafe { pos.add(1).read2() };
+                            if next2 == [LOSSY_REPLACEMENT_CHAR_BYTES[1], LOSSY_REPLACEMENT_CHAR_BYTES[2]]
+                                && self.token.lone_surrogates
+                            {
+                                str.push_str("\u{FFFD}fffd");
+                            } else {
+                                let bytes = [LOSSY_REPLACEMENT_CHAR_FIRST_BYTE, next2[0], next2[1]];
+                                // SAFETY: 0xEF is always first byte of a 3-byte UTF-8 character,
+                                // so these 3 bytes must comprise a valid UTF-8 string
+                                let s = unsafe { str::from_utf8_unchecked(&bytes) };
+                                str.push_str(s);
+                            }
+
+                            // Advance past this character.
+                            // SAFETY: Character is 3 bytes, so `pos + 2` is in bounds.
+                            // Note: `byte_search!` macro already advances `pos` by 1, so only
+                            // advance by 2 here, so that in total we skip 3 bytes.
+                            pos = unsafe { pos.add(2) };
+
+                            // Set next chunk to start after this character.
+                            // SAFETY: It's a 3-byte character, and we added 2 to `pos` above,
+                            // so `pos + 1` must be a UTF-8 char boundary.
+                            // This temporarily puts `chunk_start` 1 byte after `pos`, but the
+                            // `byte_search!` macro increments `pos` when `continue_if` returns `true`,
+                            // so `pos` will be brought up to `chunk_start` again.
+                            chunk_start = unsafe { pos.add(1) };
+
                             // Continue searching
                             true
                         }

diff --git a/napi/parser/deserialize-js.js b/napi/parser/deserialize-js.js
@@ -148,11 +148,16 @@ function deserializeTaggedTemplateExpression(pos) {
 }
 
 function deserializeTemplateElement(pos) {
+  let value = deserializeTemplateElementValue(pos + 8);
+  if (value.cooked !== null && deserializeBool(pos + 41)) {
+    value.cooked = value.cooked
+      .replace(/\uFFFD(.{4})/g, (_, hex) => String.fromCodePoint(parseInt(hex, 16)));
+  }
   return {
     type: 'TemplateElement',
     start: deserializeU32(pos),
     end: deserializeU32(pos + 4),
-    value: deserializeTemplateElementValue(pos + 8),
+    value,
     tail: deserializeBool(pos + 40),
   };
 }

diff --git a/napi/parser/deserialize-ts.js b/napi/parser/deserialize-ts.js
@@ -159,11 +159,16 @@ function deserializeTaggedTemplateExpression(pos) {
 }
 
 function deserializeTemplateElement(pos) {
+  let value = deserializeTemplateElementValue(pos + 8);
+  if (value.cooked !== null && deserializeBool(pos + 41)) {
+    value.cooked = value.cooked
+      .replace(/\uFFFD(.{4})/g, (_, hex) => String.fromCodePoint(parseInt(hex, 16)));
+  }
   return {
     type: 'TemplateElement',
     start: deserializeU32(pos),
     end: deserializeU32(pos + 4),
-    value: deserializeTemplateElementValue(pos + 8),
+    value,
     tail: deserializeBool(pos + 40),
   };
 }

diff --git a/napi/parser/test/parse-raw.test.ts b/napi/parser/test/parse-raw.test.ts
@@ -97,6 +97,10 @@ describe('edge cases', () => {
     ';"\\uD800\\uDBFF";',
     ';"�\\u{FFFD}";',
     ';"�\\u{FFFD}\\uD800\\uDBFF�\\u{FFFD}";',
+    // `TemplateLiteral`s containing lone surrogates and/or lossy replacement characters
+    '`\\uD800\\uDBFF${x}\\uD800\\uDBFF`;',
+    '`�\\u{FFFD}${x}�\\u{FFFD}`;',
+    '`�\\u{FFFD}\\uD800${x}\\uDBFF�\\u{FFFD}`;',
   ])('%s', (sourceText) => {
     assertRawAndStandardMatch('dummy.js', sourceText);
   });