diff --git a/crates/oxc_ast/src/ast/literal.rs b/crates/oxc_ast/src/ast/literal.rs index 11f7aaa24d225..a03d2d8e466c4 100644 --- a/crates/oxc_ast/src/ast/literal.rs +++ b/crates/oxc_ast/src/ast/literal.rs @@ -85,10 +85,14 @@ pub struct StringLiteral<'a> { #[content_eq(skip)] pub raw: Option>, - /// The string value contains replacement character (U+FFFD). + /// The string value contains lone surrogates. + /// + /// `value` is encoded using `\u{FFFD}` (the lossy replacement character) as an escape character. + /// Lone surrogates are encoded as `\u{FFFD}XXXX`, where `XXXX` is the code unit in hex. + /// The lossy escape character itself is encoded as `\u{FFFD}fffd`. #[builder(default)] #[estree(skip)] - pub lossy: bool, + pub lone_surrogates: bool, } /// BigInt literal diff --git a/crates/oxc_ast/src/generated/assert_layouts.rs b/crates/oxc_ast/src/generated/assert_layouts.rs index f45db57d750b2..192583465e212 100644 --- a/crates/oxc_ast/src/generated/assert_layouts.rs +++ b/crates/oxc_ast/src/generated/assert_layouts.rs @@ -776,7 +776,7 @@ const _: () = { assert!(offset_of!(StringLiteral, span) == 0); assert!(offset_of!(StringLiteral, value) == 8); assert!(offset_of!(StringLiteral, raw) == 24); - assert!(offset_of!(StringLiteral, lossy) == 40); + assert!(offset_of!(StringLiteral, lone_surrogates) == 40); assert!(size_of::() == 32); assert!(align_of::() == 8); @@ -2167,7 +2167,7 @@ const _: () = { assert!(offset_of!(StringLiteral, span) == 0); assert!(offset_of!(StringLiteral, value) == 8); assert!(offset_of!(StringLiteral, raw) == 16); - assert!(offset_of!(StringLiteral, lossy) == 24); + assert!(offset_of!(StringLiteral, lone_surrogates) == 24); assert!(size_of::() == 20); assert!(align_of::() == 4); diff --git a/crates/oxc_ast/src/generated/ast_builder.rs b/crates/oxc_ast/src/generated/ast_builder.rs index 3c1385074b2c9..61c88f90b02dc 100644 --- a/crates/oxc_ast/src/generated/ast_builder.rs +++ b/crates/oxc_ast/src/generated/ast_builder.rs @@ -276,7 +276,7 @@ 
impl<'a> AstBuilder<'a> { Expression::StringLiteral(self.alloc_string_literal(span, value, raw)) } - /// Build an [`Expression::StringLiteral`] with `lossy`. + /// Build an [`Expression::StringLiteral`] with `lone_surrogates`. /// /// This node contains a [`StringLiteral`] that will be stored in the memory arena. /// @@ -284,19 +284,24 @@ impl<'a> AstBuilder<'a> { /// * `span`: Node location in source code /// * `value`: The value of the string. /// * `raw`: The raw string as it appears in source code. - /// * `lossy`: The string value contains replacement character (U+FFFD). + /// * `lone_surrogates`: The string value contains lone surrogates. #[inline] - pub fn expression_string_literal_with_lossy( + pub fn expression_string_literal_with_lone_surrogates( self, span: Span, value: A, raw: Option>, - lossy: bool, + lone_surrogates: bool, ) -> Expression<'a> where A: IntoIn<'a, Atom<'a>>, { - Expression::StringLiteral(self.alloc_string_literal_with_lossy(span, value, raw, lossy)) + Expression::StringLiteral(self.alloc_string_literal_with_lone_surrogates( + span, + value, + raw, + lone_surrogates, + )) } /// Build an [`Expression::TemplateLiteral`]. @@ -7843,25 +7848,30 @@ impl<'a> AstBuilder<'a> { ImportAttributeKey::StringLiteral(self.string_literal(span, value, raw)) } - /// Build an [`ImportAttributeKey::StringLiteral`] with `lossy`. + /// Build an [`ImportAttributeKey::StringLiteral`] with `lone_surrogates`. /// /// ## Parameters /// * `span`: Node location in source code /// * `value`: The value of the string. /// * `raw`: The raw string as it appears in source code. - /// * `lossy`: The string value contains replacement character (U+FFFD). + /// * `lone_surrogates`: The string value contains lone surrogates. 
#[inline] - pub fn import_attribute_key_string_literal_with_lossy( + pub fn import_attribute_key_string_literal_with_lone_surrogates( self, span: Span, value: A, raw: Option>, - lossy: bool, + lone_surrogates: bool, ) -> ImportAttributeKey<'a> where A: IntoIn<'a, Atom<'a>>, { - ImportAttributeKey::StringLiteral(self.string_literal_with_lossy(span, value, raw, lossy)) + ImportAttributeKey::StringLiteral(self.string_literal_with_lone_surrogates( + span, + value, + raw, + lone_surrogates, + )) } /// Build an [`ExportNamedDeclaration`]. @@ -8442,25 +8452,30 @@ impl<'a> AstBuilder<'a> { ModuleExportName::StringLiteral(self.string_literal(span, value, raw)) } - /// Build a [`ModuleExportName::StringLiteral`] with `lossy`. + /// Build a [`ModuleExportName::StringLiteral`] with `lone_surrogates`. /// /// ## Parameters /// * `span`: Node location in source code /// * `value`: The value of the string. /// * `raw`: The raw string as it appears in source code. - /// * `lossy`: The string value contains replacement character (U+FFFD). + /// * `lone_surrogates`: The string value contains lone surrogates. #[inline] - pub fn module_export_name_string_literal_with_lossy( + pub fn module_export_name_string_literal_with_lone_surrogates( self, span: Span, value: A, raw: Option>, - lossy: bool, + lone_surrogates: bool, ) -> ModuleExportName<'a> where A: IntoIn<'a, Atom<'a>>, { - ModuleExportName::StringLiteral(self.string_literal_with_lossy(span, value, raw, lossy)) + ModuleExportName::StringLiteral(self.string_literal_with_lone_surrogates( + span, + value, + raw, + lone_surrogates, + )) } /// Build a [`V8IntrinsicExpression`]. @@ -8598,7 +8613,12 @@ impl<'a> AstBuilder<'a> { where A: IntoIn<'a, Atom<'a>>, { - StringLiteral { span, value: value.into_in(self.allocator), raw, lossy: Default::default() } + StringLiteral { + span, + value: value.into_in(self.allocator), + raw, + lone_surrogates: Default::default(), + } } /// Build a [`StringLiteral`], and store it in the memory arena. 
@@ -8622,50 +8642,53 @@ impl<'a> AstBuilder<'a> { Box::new_in(self.string_literal(span, value, raw), self.allocator) } - /// Build a [`StringLiteral`] with `lossy`. + /// Build a [`StringLiteral`] with `lone_surrogates`. /// - /// If you want the built node to be allocated in the memory arena, use [`AstBuilder::alloc_string_literal_with_lossy`] instead. + /// If you want the built node to be allocated in the memory arena, use [`AstBuilder::alloc_string_literal_with_lone_surrogates`] instead. /// /// ## Parameters /// * `span`: Node location in source code /// * `value`: The value of the string. /// * `raw`: The raw string as it appears in source code. - /// * `lossy`: The string value contains replacement character (U+FFFD). + /// * `lone_surrogates`: The string value contains lone surrogates. #[inline] - pub fn string_literal_with_lossy( + pub fn string_literal_with_lone_surrogates( self, span: Span, value: A, raw: Option>, - lossy: bool, + lone_surrogates: bool, ) -> StringLiteral<'a> where A: IntoIn<'a, Atom<'a>>, { - StringLiteral { span, value: value.into_in(self.allocator), raw, lossy } + StringLiteral { span, value: value.into_in(self.allocator), raw, lone_surrogates } } - /// Build a [`StringLiteral`] with `lossy`, and store it in the memory arena. + /// Build a [`StringLiteral`] with `lone_surrogates`, and store it in the memory arena. /// - /// Returns a [`Box`] containing the newly-allocated node. If you want a stack-allocated node, use [`AstBuilder::string_literal_with_lossy`] instead. + /// Returns a [`Box`] containing the newly-allocated node. If you want a stack-allocated node, use [`AstBuilder::string_literal_with_lone_surrogates`] instead. /// /// ## Parameters /// * `span`: Node location in source code /// * `value`: The value of the string. /// * `raw`: The raw string as it appears in source code. - /// * `lossy`: The string value contains replacement character (U+FFFD). + /// * `lone_surrogates`: The string value contains lone surrogates. 
#[inline] - pub fn alloc_string_literal_with_lossy( + pub fn alloc_string_literal_with_lone_surrogates( self, span: Span, value: A, raw: Option>, - lossy: bool, + lone_surrogates: bool, ) -> Box<'a, StringLiteral<'a>> where A: IntoIn<'a, Atom<'a>>, { - Box::new_in(self.string_literal_with_lossy(span, value, raw, lossy), self.allocator) + Box::new_in( + self.string_literal_with_lone_surrogates(span, value, raw, lone_surrogates), + self.allocator, + ) } /// Build a [`BigIntLiteral`]. @@ -9444,7 +9467,7 @@ impl<'a> AstBuilder<'a> { JSXAttributeValue::StringLiteral(self.alloc_string_literal(span, value, raw)) } - /// Build a [`JSXAttributeValue::StringLiteral`] with `lossy`. + /// Build a [`JSXAttributeValue::StringLiteral`] with `lone_surrogates`. /// /// This node contains a [`StringLiteral`] that will be stored in the memory arena. /// @@ -9452,21 +9475,24 @@ impl<'a> AstBuilder<'a> { /// * `span`: Node location in source code /// * `value`: The value of the string. /// * `raw`: The raw string as it appears in source code. - /// * `lossy`: The string value contains replacement character (U+FFFD). + /// * `lone_surrogates`: The string value contains lone surrogates. #[inline] - pub fn jsx_attribute_value_string_literal_with_lossy( + pub fn jsx_attribute_value_string_literal_with_lone_surrogates( self, span: Span, value: A, raw: Option>, - lossy: bool, + lone_surrogates: bool, ) -> JSXAttributeValue<'a> where A: IntoIn<'a, Atom<'a>>, { - JSXAttributeValue::StringLiteral( - self.alloc_string_literal_with_lossy(span, value, raw, lossy), - ) + JSXAttributeValue::StringLiteral(self.alloc_string_literal_with_lone_surrogates( + span, + value, + raw, + lone_surrogates, + )) } /// Build a [`JSXAttributeValue::ExpressionContainer`]. @@ -9949,7 +9975,7 @@ impl<'a> AstBuilder<'a> { TSEnumMemberName::String(self.alloc_string_literal(span, value, raw)) } - /// Build a [`TSEnumMemberName::String`] with `lossy`. + /// Build a [`TSEnumMemberName::String`] with `lone_surrogates`. 
/// /// This node contains a [`StringLiteral`] that will be stored in the memory arena. /// @@ -9957,19 +9983,24 @@ impl<'a> AstBuilder<'a> { /// * `span`: Node location in source code /// * `value`: The value of the string. /// * `raw`: The raw string as it appears in source code. - /// * `lossy`: The string value contains replacement character (U+FFFD). + /// * `lone_surrogates`: The string value contains lone surrogates. #[inline] - pub fn ts_enum_member_name_string_with_lossy( + pub fn ts_enum_member_name_string_with_lone_surrogates( self, span: Span, value: A, raw: Option>, - lossy: bool, + lone_surrogates: bool, ) -> TSEnumMemberName<'a> where A: IntoIn<'a, Atom<'a>>, { - TSEnumMemberName::String(self.alloc_string_literal_with_lossy(span, value, raw, lossy)) + TSEnumMemberName::String(self.alloc_string_literal_with_lone_surrogates( + span, + value, + raw, + lone_surrogates, + )) } /// Build a [`TSTypeAnnotation`]. @@ -10106,7 +10137,7 @@ impl<'a> AstBuilder<'a> { TSLiteral::StringLiteral(self.alloc_string_literal(span, value, raw)) } - /// Build a [`TSLiteral::StringLiteral`] with `lossy`. + /// Build a [`TSLiteral::StringLiteral`] with `lone_surrogates`. /// /// This node contains a [`StringLiteral`] that will be stored in the memory arena. /// @@ -10114,19 +10145,24 @@ impl<'a> AstBuilder<'a> { /// * `span`: Node location in source code /// * `value`: The value of the string. /// * `raw`: The raw string as it appears in source code. - /// * `lossy`: The string value contains replacement character (U+FFFD). + /// * `lone_surrogates`: The string value contains lone surrogates. 
#[inline] - pub fn ts_literal_string_literal_with_lossy( + pub fn ts_literal_string_literal_with_lone_surrogates( self, span: Span, value: A, raw: Option>, - lossy: bool, + lone_surrogates: bool, ) -> TSLiteral<'a> where A: IntoIn<'a, Atom<'a>>, { - TSLiteral::StringLiteral(self.alloc_string_literal_with_lossy(span, value, raw, lossy)) + TSLiteral::StringLiteral(self.alloc_string_literal_with_lone_surrogates( + span, + value, + raw, + lone_surrogates, + )) } /// Build a [`TSLiteral::TemplateLiteral`]. @@ -13387,27 +13423,30 @@ impl<'a> AstBuilder<'a> { TSModuleDeclarationName::StringLiteral(self.string_literal(span, value, raw)) } - /// Build a [`TSModuleDeclarationName::StringLiteral`] with `lossy`. + /// Build a [`TSModuleDeclarationName::StringLiteral`] with `lone_surrogates`. /// /// ## Parameters /// * `span`: Node location in source code /// * `value`: The value of the string. /// * `raw`: The raw string as it appears in source code. - /// * `lossy`: The string value contains replacement character (U+FFFD). + /// * `lone_surrogates`: The string value contains lone surrogates. #[inline] - pub fn ts_module_declaration_name_string_literal_with_lossy( + pub fn ts_module_declaration_name_string_literal_with_lone_surrogates( self, span: Span, value: A, raw: Option>, - lossy: bool, + lone_surrogates: bool, ) -> TSModuleDeclarationName<'a> where A: IntoIn<'a, Atom<'a>>, { - TSModuleDeclarationName::StringLiteral( - self.string_literal_with_lossy(span, value, raw, lossy), - ) + TSModuleDeclarationName::StringLiteral(self.string_literal_with_lone_surrogates( + span, + value, + raw, + lone_surrogates, + )) } /// Build a [`TSModuleDeclarationBody::TSModuleDeclaration`]. 
diff --git a/crates/oxc_ast/src/generated/derive_clone_in.rs b/crates/oxc_ast/src/generated/derive_clone_in.rs index 7783a17120e70..fe6b4d8a92d79 100644 --- a/crates/oxc_ast/src/generated/derive_clone_in.rs +++ b/crates/oxc_ast/src/generated/derive_clone_in.rs @@ -4843,7 +4843,7 @@ impl<'new_alloc> CloneIn<'new_alloc> for StringLiteral<'_> { span: CloneIn::clone_in(&self.span, allocator), value: CloneIn::clone_in(&self.value, allocator), raw: CloneIn::clone_in(&self.raw, allocator), - lossy: CloneIn::clone_in(&self.lossy, allocator), + lone_surrogates: CloneIn::clone_in(&self.lone_surrogates, allocator), } } @@ -4852,7 +4852,7 @@ impl<'new_alloc> CloneIn<'new_alloc> for StringLiteral<'_> { span: CloneIn::clone_in_with_semantic_ids(&self.span, allocator), value: CloneIn::clone_in_with_semantic_ids(&self.value, allocator), raw: CloneIn::clone_in_with_semantic_ids(&self.raw, allocator), - lossy: CloneIn::clone_in_with_semantic_ids(&self.lossy, allocator), + lone_surrogates: CloneIn::clone_in_with_semantic_ids(&self.lone_surrogates, allocator), } } } diff --git a/crates/oxc_ast/src/generated/derive_content_eq.rs b/crates/oxc_ast/src/generated/derive_content_eq.rs index f21c825d68a08..e9cb8d3c237f8 100644 --- a/crates/oxc_ast/src/generated/derive_content_eq.rs +++ b/crates/oxc_ast/src/generated/derive_content_eq.rs @@ -1471,7 +1471,7 @@ impl ContentEq for NumericLiteral<'_> { impl ContentEq for StringLiteral<'_> { fn content_eq(&self, other: &Self) -> bool { ContentEq::content_eq(&self.value, &other.value) - && ContentEq::content_eq(&self.lossy, &other.lossy) + && ContentEq::content_eq(&self.lone_surrogates, &other.lone_surrogates) } } diff --git a/crates/oxc_ast/src/generated/derive_dummy.rs b/crates/oxc_ast/src/generated/derive_dummy.rs index b113c775cb348..3c3e7960d3285 100644 --- a/crates/oxc_ast/src/generated/derive_dummy.rs +++ b/crates/oxc_ast/src/generated/derive_dummy.rs @@ -1597,7 +1597,7 @@ impl<'a> Dummy<'a> for StringLiteral<'a> { span: 
Dummy::dummy(allocator), value: Dummy::dummy(allocator), raw: Dummy::dummy(allocator), - lossy: Dummy::dummy(allocator), + lone_surrogates: Dummy::dummy(allocator), } } } diff --git a/crates/oxc_ast/src/serialize.rs b/crates/oxc_ast/src/serialize.rs index aad56b4df31a5..b7177a79a69b2 100644 --- a/crates/oxc_ast/src/serialize.rs +++ b/crates/oxc_ast/src/serialize.rs @@ -1,11 +1,9 @@ -use std::borrow::Cow; - use cow_utils::CowUtils; use oxc_ast_macros::ast_meta; use oxc_estree::{ - CompactJSSerializer, CompactTSSerializer, ESTree, JsonSafeString, PrettyJSSerializer, - PrettyTSSerializer, SequenceSerializer, Serializer, StructSerializer, + CompactJSSerializer, CompactTSSerializer, ESTree, JsonSafeString, LoneSurrogatesString, + PrettyJSSerializer, PrettyTSSerializer, SequenceSerializer, Serializer, StructSerializer, }; use crate::ast::*; @@ -233,10 +231,13 @@ impl ESTree for NullLiteralRaw<'_> { #[ast_meta] #[estree( ts_type = "string", - raw_deser = " - const lossy = DESER[bool](POS_OFFSET.lossy); - (THIS.lossy && THIS.raw !== null) ? 
(0, eval)(THIS.raw) : DESER[Atom](POS_OFFSET.value) - " + raw_deser = r#" + let value = DESER[Atom](POS_OFFSET.value); + if (DESER[bool](POS_OFFSET.lone_surrogates)) { + value = value.replace(/\uFFFD(.{4})/g, (_, hex) => String.fromCodePoint(parseInt(hex, 16))); + } + value + "# )] pub struct StringLiteralValue<'a, 'b>(pub &'b StringLiteral<'a>); @@ -244,10 +245,11 @@ impl ESTree for StringLiteralValue<'_, '_> { fn serialize(&self, serializer: S) { let lit = self.0; #[expect(clippy::if_not_else)] - if !lit.lossy { + if !lit.lone_surrogates { lit.value.serialize(serializer); } else { - self.serialize_lossy(serializer); + // String contains lone surrogates + self.serialize_lone_surrogates(serializer); } } } @@ -255,42 +257,8 @@ impl ESTree for StringLiteralValue<'_, '_> { impl StringLiteralValue<'_, '_> { #[cold] #[inline(never)] - fn serialize_lossy(&self, serializer: S) { - // String contains lone surrogates - let lit = self.0; - let raw = - lit.raw.expect("`StringLiteral` with `lossy` flag set must have `raw` field populated"); - let raw = raw.as_str(); - let quote = raw.as_bytes().first().copied().unwrap(); - let raw_unquoted = &raw[1..raw.len() - 1]; - let raw_with_quotes_escaped = if quote == b'"' { - // String was in double quotes in original source, so it's valid JSON - Cow::Borrowed(raw_unquoted) - } else { - // String was in single quotes in original source. - // We need to replace any `"` characters with `\"`, and then it's valid JSON. - // We *don't* escape `"` characters preceded by an odd number of `\`s, - // because they're already escaped. - // We could make this more performant, but it should be vanishingly rare to hit this path anyway. 
- let mut encoded = Vec::with_capacity(raw_unquoted.len()); - let mut has_slash = false; - for &b in raw_unquoted.as_bytes() { - if b == b'\\' { - has_slash = !has_slash; - } else if b == b'"' && !has_slash { - encoded.push(b'\\'); - } else { - has_slash = false; - } - encoded.push(b); - } - - // SAFETY: `raw_without_quotes` is a `&str`. `encoded` contains the exact same bytes, - // except we may have inserted `\` before `"`s. That cannot create invalid UTF-8. - let encoded = unsafe { String::from_utf8_unchecked(encoded) }; - Cow::Owned(encoded) - }; - JsonSafeString(&raw_with_quotes_escaped).serialize(serializer); + fn serialize_lone_surrogates(&self, serializer: S) { + LoneSurrogatesString(self.0.value.as_str()).serialize(serializer); } } diff --git a/crates/oxc_codegen/src/lib.rs b/crates/oxc_codegen/src/lib.rs index b74a3db51e4f0..acb413068bff7 100644 --- a/crates/oxc_codegen/src/lib.rs +++ b/crates/oxc_codegen/src/lib.rs @@ -578,7 +578,7 @@ impl<'a> Codegen<'a> { fn print_string_literal(&mut self, s: &StringLiteral<'_>, allow_backtick: bool) { self.add_source_mapping(s.span); - if s.lossy { + if s.lone_surrogates { self.print_str(s.raw.unwrap().as_str()); return; } diff --git a/crates/oxc_estree/src/serialize/mod.rs b/crates/oxc_estree/src/serialize/mod.rs index 9727f1855763c..c07475f76355f 100644 --- a/crates/oxc_estree/src/serialize/mod.rs +++ b/crates/oxc_estree/src/serialize/mod.rs @@ -16,7 +16,7 @@ use sequences::ESTreeSequenceSerializer; use structs::ESTreeStructSerializer; pub use sequences::SequenceSerializer; -pub use strings::JsonSafeString; +pub use strings::{JsonSafeString, LoneSurrogatesString}; pub use structs::{FlatStructSerializer, StructSerializer}; /// Trait for types which can be serialized to ESTree. 
diff --git a/crates/oxc_estree/src/serialize/strings.rs b/crates/oxc_estree/src/serialize/strings.rs index 336a502a47e33..1b83cfb69ba5c 100644 --- a/crates/oxc_estree/src/serialize/strings.rs +++ b/crates/oxc_estree/src/serialize/strings.rs @@ -2,6 +2,17 @@ use oxc_data_structures::code_buffer::CodeBuffer; use super::{ESTree, Serializer}; +/// Convert `char` to UTF-8 bytes array. +const fn to_bytes(ch: char) -> [u8; N] { + let mut bytes = [0u8; N]; + ch.encode_utf8(&mut bytes); + bytes +} + +/// Lossy replacement character (U+FFFD) as UTF-8 bytes. +const LOSSY_REPLACEMENT_CHAR_BYTES: [u8; 3] = to_bytes('\u{FFFD}'); +const LOSSY_REPLACEMENT_CHAR_FIRST_BYTE: u8 = LOSSY_REPLACEMENT_CHAR_BYTES[0]; // 0xEF + /// A string which does not need any escaping in JSON. /// /// This provides better performance when you know that the string definitely contains no characters @@ -20,17 +31,32 @@ impl ESTree for JsonSafeString<'_> { } } +/// A string which contains lone surrogates escaped with lossy replacement character (U+FFFD). +/// +/// Lone surrogates are encoded in the string as `\uFFFD1234` where `1234` is the code point in hex. +/// These are converted to `\u1234` in JSON. +/// An actual lossy replacement character is encoded in the string as `\uFFFDfffd`, and is converted +/// to the actual character. +pub struct LoneSurrogatesString<'s>(pub &'s str); + +impl ESTree for LoneSurrogatesString<'_> { + #[inline(always)] + fn serialize(&self, mut serializer: S) { + write_str(self.0, &ESCAPE_LONE_SURROGATES, serializer.buffer_mut()); + } +} + /// [`ESTree`] implementation for string slice. impl ESTree for str { fn serialize(&self, mut serializer: S) { - write_str(self, serializer.buffer_mut()); + write_str(self, &ESCAPE, serializer.buffer_mut()); } } /// [`ESTree`] implementation for `String`. 
impl ESTree for String { - fn serialize(&self, mut serializer: S) { - write_str(self.as_str(), serializer.buffer_mut()); + fn serialize(&self, serializer: S) { + self.as_str().serialize(serializer); } } @@ -46,7 +72,8 @@ enum Escape { RR = b'r', // \x0D QU = b'"', // \x22 BS = b'\\', // \x5C - UU = b'u', // \x00...\x1F except the ones above + LO = LOSSY_REPLACEMENT_CHAR_FIRST_BYTE, + UU = b'u', // \x00...\x1F except the ones above } /// Lookup table of escape sequences. A value of `b'x'` at index `i` means that byte `i` @@ -54,7 +81,12 @@ enum Escape { /// /// A value of `UU` means that byte is escaped as `\u00xx`, where `xx` is the hex code of the byte. /// e.g. `0x1F` is output as `\u001F`. -static ESCAPE: [Escape; 256] = { +static ESCAPE: [Escape; 256] = create_table(Escape::__); + +/// Same as `ESCAPE` but with `Escape::LO` for byte 0xEF. +static ESCAPE_LONE_SURROGATES: [Escape; 256] = create_table(Escape::LO); + +const fn create_table(lo: Escape) -> [Escape; 256] { #[allow(clippy::enum_glob_use, clippy::allow_attributes)] use Escape::*; @@ -74,29 +106,73 @@ static ESCAPE: [Escape; 256] = { __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, lo, // E __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F ] -}; +} /// Write string to buffer. /// String is wrapped in `"`s, and with any characters which are not valid in JSON escaped. +// +// `#[inline(always)]` because this is a hot path, and to make compiler remove the code +// for handling lone surrogates when outputting a normal string (the common case). 
#[inline(always)] -fn write_str(s: &str, buffer: &mut CodeBuffer) { +fn write_str(s: &str, table: &[Escape; 256], buffer: &mut CodeBuffer) { buffer.print_ascii_byte(b'"'); let bytes = s.as_bytes(); let mut start = 0; - for (index, &byte) in bytes.iter().enumerate() { - let escape = ESCAPE[byte as usize]; + let mut iter = bytes.iter().enumerate(); + while let Some((index, &byte)) = iter.next() { + let escape = table[byte as usize]; if escape == Escape::__ { continue; } + // Handle lone surrogates + if table == &ESCAPE_LONE_SURROGATES && escape == Escape::LO { + let (_, &next1) = iter.next().unwrap(); + let (_, &next2) = iter.next().unwrap(); + if [next1, next2] == [LOSSY_REPLACEMENT_CHAR_BYTES[1], LOSSY_REPLACEMENT_CHAR_BYTES[2]] + { + // Lossy replacement character (U+FFFD) is used as an escape before lone surrogates, + // with the code point as 4 x hex characters after it. + let (_, &hex1) = iter.next().unwrap(); + let (_, &hex2) = iter.next().unwrap(); + let (_, &hex3) = iter.next().unwrap(); + let (_, &hex4) = iter.next().unwrap(); + + // Print the chunk up to (not including) the lossy replacement character. + // SAFETY: 0xEF is always the start of a 3-byte unicode character. + // Therefore `index` must be on a UTF-8 character boundary. + unsafe { buffer.print_bytes_unchecked(&bytes[start..index]) }; + + if [hex1, hex2, hex3, hex4] == *b"fffd" { + // This is an actual lossy replacement character (not an escaped lone surrogate) + buffer.print_str("\u{FFFD}"); + } else { + // This is an escaped lone surrogate. + // Print `\uXXXX` where `XXXX` is hex characters. e.g. `\ud800`. + assert!((hex1 | hex2 | hex3 | hex4).is_ascii()); + buffer.print_str("\\u"); + // SAFETY: Just checked all 4 bytes are ASCII + unsafe { buffer.print_bytes_unchecked(&[hex1, hex2, hex3, hex4]) }; + } + + // Skip the 3 bytes of the lossy replacement character + 4 hex bytes. + // We checked that all 4 hex bytes are ASCII, so `start` is definitely left on + // a UTF-8 character boundary. 
+ start = index + 7; + } else { + // Some other unicode character starting with 0xEF. Just continue the loop. + } + continue; + } + if start < index { // SAFETY: `bytes` is derived from a `&str`. - // `escape` is only non-zero for ASCII bytes. + // `escape` is only non-zero for ASCII bytes, except `Escape::LO` which is handled above. // Therefore current `index` must mark the end of a valid UTF8 character sequence. // `start` is either the start of string, or after an ASCII character, // therefore always the start of a valid UTF8 character sequence. @@ -181,7 +257,38 @@ mod tests { for (input, output) in cases { let mut serializer = CompactTSSerializer::new(); - input.serialize(&mut serializer); + input.to_string().serialize(&mut serializer); + let s = serializer.into_string(); + assert_eq!(&s, output); + } + } + + #[test] + fn serialize_json_safe_string() { + let cases = [("", r#""""#), ("a", r#""a""#), ("abc", r#""abc""#)]; + + for (input, output) in cases { + let mut serializer = CompactTSSerializer::new(); + JsonSafeString(input).serialize(&mut serializer); + let s = serializer.into_string(); + assert_eq!(&s, output); + } + } + + #[test] + fn serialize_lone_surrogates_string() { + let cases = [ + ("\u{FFFD}fffd", "\"\u{FFFD}\""), + ("_x_\u{FFFD}fffd_y_\u{FFFD}fffd_z_", "\"_x_\u{FFFD}_y_\u{FFFD}_z_\""), + ("\u{FFFD}d834\u{FFFD}d835", r#""\ud834\ud835""#), + ("_x_\u{FFFD}d834\u{FFFD}d835", r#""_x_\ud834\ud835""#), + ("\u{FFFD}d834\u{FFFD}d835_y_", r#""\ud834\ud835_y_""#), + ("_x_\u{FFFD}d834_y_\u{FFFD}d835_z_", r#""_x_\ud834_y_\ud835_z_""#), + ]; + + for (input, output) in cases { + let mut serializer = CompactTSSerializer::new(); + LoneSurrogatesString(input).serialize(&mut serializer); let s = serializer.into_string(); assert_eq!(&s, output); } diff --git a/crates/oxc_parser/src/js/expression.rs b/crates/oxc_parser/src/js/expression.rs index 7d8c04301de51..f5371284901ce 100644 --- a/crates/oxc_parser/src/js/expression.rs +++ 
b/crates/oxc_parser/src/js/expression.rs @@ -406,7 +406,7 @@ impl<'a> ParserImpl<'a> { } let value = self.cur_string(); let span = self.start_span(); - let lossy = self.cur_token().lossy; + let lone_surrogates = self.cur_token().lone_surrogates; self.bump_any(); let span = self.end_span(span); // SAFETY: @@ -415,7 +415,7 @@ impl<'a> ParserImpl<'a> { self.source_text.get_unchecked(span.start as usize..span.end as usize) }); let mut string_literal = self.ast.string_literal(span, value, Some(raw)); - string_literal.lossy = lossy; + string_literal.lone_surrogates = lone_surrogates; Ok(string_literal) } diff --git a/crates/oxc_parser/src/js/module.rs b/crates/oxc_parser/src/js/module.rs index 35526c1d74313..561ada51df3c5 100644 --- a/crates/oxc_parser/src/js/module.rs +++ b/crates/oxc_parser/src/js/module.rs @@ -503,7 +503,7 @@ impl<'a> ParserImpl<'a> { let literal = self.parse_literal_string()?; // ModuleExportName : StringLiteral // It is a Syntax Error if IsStringWellFormedUnicode(the SV of StringLiteral) is false. - if literal.lossy || !literal.is_string_well_formed_unicode() { + if literal.lone_surrogates || !literal.is_string_well_formed_unicode() { self.error(diagnostics::export_lone_surrogate(literal.span)); }; Ok(ModuleExportName::StringLiteral(literal)) diff --git a/crates/oxc_parser/src/lexer/string.rs b/crates/oxc_parser/src/lexer/string.rs index 10f471fb02f87..4078d603cb779 100644 --- a/crates/oxc_parser/src/lexer/string.rs +++ b/crates/oxc_parser/src/lexer/string.rs @@ -9,6 +9,17 @@ use super::{ search::{SafeByteMatchTable, byte_search, safe_byte_match_table}, }; +/// Convert `char` to UTF-8 bytes array. +const fn to_bytes(ch: char) -> [u8; N] { + let mut bytes = [0u8; N]; + ch.encode_utf8(&mut bytes); + bytes +} + +/// Lossy replacement character (U+FFFD) as UTF-8 bytes. 
+const LOSSY_REPLACEMENT_CHAR_BYTES: [u8; 3] = to_bytes('\u{FFFD}'); +const LOSSY_REPLACEMENT_CHAR_FIRST_BYTE: u8 = LOSSY_REPLACEMENT_CHAR_BYTES[0]; // 0xEF + const MIN_ESCAPED_STR_LEN: usize = 16; static DOUBLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable = @@ -17,6 +28,17 @@ static DOUBLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable = static SINGLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable = safe_byte_match_table!(|b| matches!(b, b'\'' | b'\r' | b'\n' | b'\\')); +// Same as above, but with 1st byte of lossy replacement character added +static DOUBLE_QUOTE_ESCAPED_MATCH_TABLE: SafeByteMatchTable = safe_byte_match_table!(|b| matches!( + b, + b'"' | b'\r' | b'\n' | b'\\' | LOSSY_REPLACEMENT_CHAR_FIRST_BYTE +)); + +static SINGLE_QUOTE_ESCAPED_MATCH_TABLE: SafeByteMatchTable = safe_byte_match_table!(|b| matches!( + b, + b'\'' | b'\r' | b'\n' | b'\\' | LOSSY_REPLACEMENT_CHAR_FIRST_BYTE +)); + /// Macro to handle a string literal. /// /// # SAFETY @@ -25,7 +47,7 @@ static SINGLE_QUOTE_STRING_END_TABLE: SafeByteMatchTable = /// `$table` must be a `SafeByteMatchTable`. /// `$table` must only match `$delimiter`, '\', '\r' or '\n'. macro_rules! handle_string_literal { - ($lexer:ident, $delimiter:literal, $table:ident) => {{ + ($lexer:ident, $delimiter:literal, $table:ident, $escaped_table:ident) => {{ debug_assert!($delimiter.is_ascii()); if $lexer.context == LexerContext::JsxAttributeValue { @@ -58,7 +80,12 @@ macro_rules! handle_string_literal { Kind::Str } b'\\' => cold_branch(|| { - handle_string_literal_escape!($lexer, $delimiter, $table, after_opening_quote) + handle_string_literal_escape!( + $lexer, + $delimiter, + $escaped_table, + after_opening_quote + ) }), _ => { // Line break. This is impossible in valid JS, so cold path. @@ -99,7 +126,7 @@ macro_rules! 
handle_string_literal_escape { } // Consume bytes until reach end of string, line break, or another escape - let chunk_start = $lexer.source.position(); + let mut chunk_start = $lexer.source.position(); while let Some(b) = $lexer.peek_byte() { match b { b if !$table.matches(b) => { @@ -127,6 +154,27 @@ macro_rules! handle_string_literal_escape { str.push_str(chunk); continue 'outer; } + LOSSY_REPLACEMENT_CHAR_FIRST_BYTE => cold_branch(|| { + // If the string contains lone surrogates, the lossy replacement character (U+FFFD) + // is used as start of an escape sequence. + // So an actual lossy escape character has to be escaped too. + // Output it as `\u{FFFD}fffd`. + // Cold branch because this should be very rare in real-world code. + + // SAFETY: A byte is available, as we just peeked it, and it's 0xEF. + // 0xEF is always 1st byte of a 3-byte Unicode sequence, so safe to consume 3 bytes. + $lexer.source.next_byte_unchecked(); + let next1 = $lexer.source.next_byte_unchecked(); + let next2 = $lexer.source.next_byte_unchecked(); + if $lexer.token.lone_surrogates + && [next1, next2] == [LOSSY_REPLACEMENT_CHAR_BYTES[1], LOSSY_REPLACEMENT_CHAR_BYTES[2]] + { + let chunk = $lexer.source.str_from_pos_to_current(chunk_start); + str.push_str(chunk); + str.push_str("fffd"); + chunk_start = $lexer.source.position(); + } + }), _ => { // Line break. This is impossible in valid JS, so cold path. return cold_branch(|| { @@ -159,7 +207,14 @@ impl<'a> Lexer<'a> { pub(super) unsafe fn read_string_literal_double_quote(&mut self) -> Kind { // SAFETY: Caller guarantees next char is `"`, which is ASCII. // b'"' is an ASCII byte. `DOUBLE_QUOTE_STRING_END_TABLE` is a `SafeByteMatchTable`. - unsafe { handle_string_literal!(self, b'"', DOUBLE_QUOTE_STRING_END_TABLE) } + unsafe { + handle_string_literal!( + self, + b'"', + DOUBLE_QUOTE_STRING_END_TABLE, + DOUBLE_QUOTE_ESCAPED_MATCH_TABLE + ) + } } /// Read string literal delimited with `'`. 
@@ -168,7 +223,14 @@ impl<'a> Lexer<'a> { pub(super) unsafe fn read_string_literal_single_quote(&mut self) -> Kind { // SAFETY: Caller guarantees next char is `'`, which is ASCII. // b'\'' is an ASCII byte. `SINGLE_QUOTE_STRING_END_TABLE` is a `SafeByteMatchTable`. - unsafe { handle_string_literal!(self, b'\'', SINGLE_QUOTE_STRING_END_TABLE) } + unsafe { + handle_string_literal!( + self, + b'\'', + SINGLE_QUOTE_STRING_END_TABLE, + SINGLE_QUOTE_ESCAPED_MATCH_TABLE + ) + } } /// Save the string if it is escaped diff --git a/crates/oxc_parser/src/lexer/token.rs b/crates/oxc_parser/src/lexer/token.rs index 21a0bd6879e46..0ac7f60244047 100644 --- a/crates/oxc_parser/src/lexer/token.rs +++ b/crates/oxc_parser/src/lexer/token.rs @@ -26,8 +26,8 @@ pub struct Token { /// [Lexer::escaped_templates]: [super::Lexer::escaped_templates] pub escaped: bool, - /// True if a string contains lossy replacement character (U+FFFD). - pub lossy: bool, + /// True if a string contains lone surrogates. + pub lone_surrogates: bool, /// True if for numeric literal tokens that contain separator characters (`_`). /// diff --git a/crates/oxc_parser/src/lexer/unicode.rs b/crates/oxc_parser/src/lexer/unicode.rs index 131d43151036b..20667a9111044 100644 --- a/crates/oxc_parser/src/lexer/unicode.rs +++ b/crates/oxc_parser/src/lexer/unicode.rs @@ -1,3 +1,7 @@ +use std::{borrow::Cow, fmt::Write}; + +use cow_utils::CowUtils; + use oxc_allocator::String; use oxc_syntax::identifier::{ CR, FF, LF, LS, PS, TAB, VT, is_identifier_part, is_identifier_start, @@ -8,13 +12,19 @@ use crate::diagnostics; use super::{Kind, Lexer, Span}; -enum SurrogatePair { - // valid \u Hex4Digits \u Hex4Digits - Astral(u32), - // valid \u Hex4Digits - CodePoint(u32), - // invalid \u Hex4Digits \u Hex4Digits - HighLow(u32, u32), +/// A Unicode escape sequence. +/// +/// `\u Hex4Digits`, `\u Hex4Digits \u Hex4Digits`, or `\u{ HexDigits }`. 
+enum UnicodeEscape { + // `\u Hex4Digits` or `\u{ HexDigits }`, which forms a valid Unicode code point. + // Char cannot be in range 0xD800..=0xDFFF. + CodePoint(char), + // `\u Hex4Digits \u Hex4Digits`, which forms a valid Unicode astral code point. + // Char is in the range 0x10000..=0x10FFFF. + SurrogatePair(char), + // `\u Hex4Digits` or `\u{ HexDigits }`, which forms an invalid Unicode code point. + // Code unit is in the range 0xD800..=0xDFFF. + LoneSurrogate(u32), } impl<'a> Lexer<'a> { @@ -69,7 +79,7 @@ impl<'a> Lexer<'a> { self.consume_char(); self.unicode_code_point() } - _ => self.surrogate_pair(), + _ => self.unicode_code_unit(), }; let Some(value) = value else { @@ -80,20 +90,12 @@ impl<'a> Lexer<'a> { // For Identifiers, surrogate pair is an invalid grammar, e.g. `var \uD800\uDEA7`. let ch = match value { - SurrogatePair::Astral(..) | SurrogatePair::HighLow(..) => { + UnicodeEscape::CodePoint(ch) => ch, + UnicodeEscape::SurrogatePair(_) | UnicodeEscape::LoneSurrogate(_) => { let range = Span::new(start, self.offset()); self.error(diagnostics::unicode_escape_sequence(range)); return; } - SurrogatePair::CodePoint(code_point) => { - if let Ok(ch) = char::try_from(code_point) { - ch - } else { - let range = Span::new(start, self.offset()); - self.error(diagnostics::unicode_escape_sequence(range)); - return; - } - } }; let is_valid = @@ -121,7 +123,7 @@ impl<'a> Lexer<'a> { self.consume_char(); self.unicode_code_point() } - _ => self.surrogate_pair(), + _ => self.unicode_code_unit(), }; let Some(value) = value else { @@ -132,32 +134,47 @@ impl<'a> Lexer<'a> { // For strings and templates, surrogate pairs are valid grammar, e.g. `"\uD83D\uDE00" === 😀`. match value { - SurrogatePair::CodePoint(code_point) | SurrogatePair::Astral(code_point) => { - if let Ok(ch) = char::try_from(code_point) { - text.push(ch); - } else { - // Turns lone surrogate into lossy replacement character (U+FFFD). - // A lone surrogate '\u{df06}' is not a valid UTF8 string. 
- text.push_str("\u{FFFD}"); - self.token.lossy = true; - } + UnicodeEscape::CodePoint(ch) | UnicodeEscape::SurrogatePair(ch) => { + text.push(ch); } - SurrogatePair::HighLow(_high, _low) => { - text.push_str("\u{FFFD}\u{FFFD}"); - self.token.lossy = true; + UnicodeEscape::LoneSurrogate(code_point) => { + self.string_lone_surrogate(code_point, text); } } } + /// Lone surrogate found in string. + fn string_lone_surrogate(&mut self, code_point: u32, text: &mut String<'a>) { + debug_assert!(code_point <= 0xFFFF); + + if !self.token.lone_surrogates { + self.token.lone_surrogates = true; + + // We use `\u{FFFD}` (the lossy replacement character) as a marker indicating the start + // of a lone surrogate. e.g. `\u{FFFD}d800` (which will be output as `\ud800`). + // So we need to escape any actual lossy replacement characters in the string so far. + // + // This could be more efficient, avoiding allocating a temporary `String`. + // But strings containing both lone surrogates and lossy replacement characters + // should be vanishingly rare, so don't bother. + if let Cow::Owned(replaced) = text.cow_replace("\u{FFFD}", "\u{FFFD}fffd") { + *text = String::from_str_in(&replaced, self.allocator); + } + } + + // Encode lone surrogate as `\u{FFFD}XXXX` where XXXX is the code point as hex + write!(text, "\u{FFFD}{code_point:04x}").unwrap(); + } + /// Decode unicode code point (`\u{ HexBytes }`). /// /// The opening `\u{` must already have been consumed before calling this method. 
- fn unicode_code_point(&mut self) -> Option { + fn unicode_code_point(&mut self) -> Option { let value = self.code_point()?; if !self.next_ascii_byte_eq(b'}') { return None; } - Some(SurrogatePair::CodePoint(value)) + Some(value) } fn hex_4_digits(&mut self) -> Option { @@ -195,7 +212,7 @@ impl<'a> Lexer<'a> { Some(u32::from(value)) } - fn code_point(&mut self) -> Option { + fn code_point(&mut self) -> Option { let mut value = self.hex_digit()?; while let Some(next) = self.hex_digit() { value = (value << 4) | next; @@ -203,38 +220,78 @@ impl<'a> Lexer<'a> { return None; } } - Some(value) + + match char::from_u32(value) { + Some(ch) => Some(UnicodeEscape::CodePoint(ch)), + None => Some(UnicodeEscape::LoneSurrogate(value)), + } } - /// Surrogate pairs - /// See background info: + /// Unicode code unit (`\uXXXX`). + /// + /// The opening `\u` must already have been consumed before calling this method. + /// + /// See background info on surrogate pairs: /// * `https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae` /// * `https://mathiasbynens.be/notes/javascript-identifiers-es6` - fn surrogate_pair(&mut self) -> Option { + fn unicode_code_unit(&mut self) -> Option { + const MIN_HIGH: u32 = 0xD800; + const MAX_HIGH: u32 = 0xDBFF; + const MIN_LOW: u32 = 0xDC00; + const MAX_LOW: u32 = 0xDFFF; + + // `https://tc39.es/ecma262/#sec-utf16decodesurrogatepair` + #[inline] + const fn pair_to_code_point(high: u32, low: u32) -> u32 { + (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000 + } + + const _: () = { + assert!(char::from_u32(pair_to_code_point(MIN_HIGH, MIN_LOW)).is_some()); + assert!(char::from_u32(pair_to_code_point(MIN_HIGH, MAX_LOW)).is_some()); + assert!(char::from_u32(pair_to_code_point(MAX_HIGH, MIN_LOW)).is_some()); + assert!(char::from_u32(pair_to_code_point(MAX_HIGH, MAX_LOW)).is_some()); + }; + let high = self.hex_4_digits()?; + if let Some(ch) = char::from_u32(high) { + return Some(UnicodeEscape::CodePoint(ch)); + } + // The first code unit of a 
surrogate pair is always in the range from 0xD800 to 0xDBFF, // and is called a high surrogate or a lead surrogate. - let is_pair = - (0xD800..=0xDBFF).contains(&high) && self.peek_2_bytes() == Some([b'\\', b'u']); + // Note: `high` must be >= `MIN_HIGH`, otherwise `char::from_u32` would have returned `Some`, + // and already exited. + debug_assert!(high >= MIN_HIGH); + let is_pair = high <= MAX_HIGH && self.peek_2_bytes() == Some([b'\\', b'u']); if !is_pair { - return Some(SurrogatePair::CodePoint(high)); + return Some(UnicodeEscape::LoneSurrogate(high)); } - // We checked above that next 2 chars are `\u` - self.consume_2_chars(); + let before_second = self.source.position(); + + // SAFETY: We checked above that next 2 chars are `\u` + unsafe { + self.source.next_byte_unchecked(); + self.source.next_byte_unchecked(); + } let low = self.hex_4_digits()?; // The second code unit of a surrogate pair is always in the range from 0xDC00 to 0xDFFF, // and is called a low surrogate or a trail surrogate. - if !(0xDC00..=0xDFFF).contains(&low) { - return Some(SurrogatePair::HighLow(high, low)); + // If this isn't a valid pair, rewind to before the 2nd, and return the first only. + // The 2nd could be the first part of a valid pair. 
+ if !(MIN_LOW..=MAX_LOW).contains(&low) { + self.source.set_position(before_second); + return Some(UnicodeEscape::LoneSurrogate(high)); } - // `https://tc39.es/ecma262/#sec-utf16decodesurrogatepair` - let astral_code_point = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000; - - Some(SurrogatePair::Astral(astral_code_point)) + let code_point = pair_to_code_point(high, low); + // SAFETY: `high` and `low` have been checked to be in ranges which always yield a `code_point` + // which is a valid `char` + let ch = unsafe { char::from_u32_unchecked(code_point) }; + Some(UnicodeEscape::SurrogatePair(ch)) } // EscapeSequence :: diff --git a/napi/parser/deserialize-js.js b/napi/parser/deserialize-js.js index 5d619163be434..681690ec1a428 100644 --- a/napi/parser/deserialize-js.js +++ b/napi/parser/deserialize-js.js @@ -1061,14 +1061,16 @@ function deserializeNumericLiteral(pos) { } function deserializeStringLiteral(pos) { - const raw = deserializeOptionStr(pos + 24); - const lossy = deserializeBool(pos + 40); + let value = deserializeStr(pos + 8); + if (deserializeBool(pos + 40)) { + value = value.replace(/\uFFFD(.{4})/g, (_, hex) => String.fromCodePoint(parseInt(hex, 16))); + } return { type: 'Literal', start: deserializeU32(pos), end: deserializeU32(pos + 4), - value: (lossy && raw !== null) ? 
(0, eval)(raw) : deserializeStr(pos + 8), - raw, + value, + raw: deserializeOptionStr(pos + 24), }; } diff --git a/napi/parser/deserialize-ts.js b/napi/parser/deserialize-ts.js index 9a4cb15d20b7e..dca1bf8992122 100644 --- a/napi/parser/deserialize-ts.js +++ b/napi/parser/deserialize-ts.js @@ -1126,14 +1126,16 @@ function deserializeNumericLiteral(pos) { } function deserializeStringLiteral(pos) { - const raw = deserializeOptionStr(pos + 24); - const lossy = deserializeBool(pos + 40); + let value = deserializeStr(pos + 8); + if (deserializeBool(pos + 40)) { + value = value.replace(/\uFFFD(.{4})/g, (_, hex) => String.fromCodePoint(parseInt(hex, 16))); + } return { type: 'Literal', start: deserializeU32(pos), end: deserializeU32(pos + 4), - value: (lossy && raw !== null) ? (0, eval)(raw) : deserializeStr(pos + 8), - raw, + value, + raw: deserializeOptionStr(pos + 24), }; } diff --git a/tasks/coverage/snapshots/estree_test262.snap b/tasks/coverage/snapshots/estree_test262.snap index 31d5fe6c077d9..9551cc82385c3 100644 --- a/tasks/coverage/snapshots/estree_test262.snap +++ b/tasks/coverage/snapshots/estree_test262.snap @@ -2,37 +2,13 @@ commit: bc5c1417 estree_test262 Summary: AST Parsed : 44047/44047 (100.00%) -Positive Passed: 44014/44047 (99.93%) -Mismatch: tasks/coverage/test262/test/built-ins/Array/prototype/concat/Array.prototype.concat_spreadable-string-wrapper.js -Mismatch: tasks/coverage/test262/test/built-ins/JSON/stringify/value-string-escape-unicode.js -Mismatch: tasks/coverage/test262/test/built-ins/RegExp/dotall/with-dotall-unicode.js -Mismatch: tasks/coverage/test262/test/built-ins/RegExp/dotall/with-dotall.js -Mismatch: tasks/coverage/test262/test/built-ins/RegExp/dotall/without-dotall-unicode.js -Mismatch: tasks/coverage/test262/test/built-ins/RegExp/dotall/without-dotall.js -Mismatch: tasks/coverage/test262/test/built-ins/RegExp/escape/escaped-surrogates.js -Mismatch: tasks/coverage/test262/test/built-ins/RegExp/regexp-modifiers/add-dotAll.js 
-Mismatch: tasks/coverage/test262/test/built-ins/RegExp/regexp-modifiers/changing-dotAll-flag-does-not-affect-dotAll-modifier.js -Mismatch: tasks/coverage/test262/test/built-ins/RegExp/regexp-modifiers/nesting-add-dotAll-within-remove-dotAll.js -Mismatch: tasks/coverage/test262/test/built-ins/RegExp/regexp-modifiers/nesting-remove-dotAll-within-add-dotAll.js -Mismatch: tasks/coverage/test262/test/built-ins/RegExp/regexp-modifiers/remove-dotAll.js -Mismatch: tasks/coverage/test262/test/built-ins/String/prototype/at/returns-code-unit.js -Mismatch: tasks/coverage/test262/test/built-ins/String/prototype/codePointAt/return-first-code-unit.js -Mismatch: tasks/coverage/test262/test/built-ins/String/prototype/codePointAt/return-single-code-unit.js -Mismatch: tasks/coverage/test262/test/built-ins/String/prototype/isWellFormed/returns-boolean.js -Mismatch: tasks/coverage/test262/test/built-ins/String/prototype/match/regexp-prototype-match-v-u-flag.js -Mismatch: tasks/coverage/test262/test/built-ins/String/prototype/padEnd/normal-operation.js -Mismatch: tasks/coverage/test262/test/built-ins/String/prototype/padStart/normal-operation.js -Mismatch: tasks/coverage/test262/test/built-ins/String/prototype/toWellFormed/returns-well-formed-string.js -Mismatch: tasks/coverage/test262/test/built-ins/StringIteratorPrototype/next/next-iteration-surrogate-pairs.js -Mismatch: tasks/coverage/test262/test/intl402/NumberFormat/prototype/format/format-non-finite-numbers.js +Positive Passed: 44038/44047 (99.98%) Mismatch: tasks/coverage/test262/test/language/expressions/assignment/fn-name-lhs-cover.js Mismatch: tasks/coverage/test262/test/language/expressions/assignment/target-cover-id.js Mismatch: tasks/coverage/test262/test/language/expressions/postfix-decrement/target-cover-id.js Mismatch: tasks/coverage/test262/test/language/expressions/postfix-increment/target-cover-id.js Mismatch: tasks/coverage/test262/test/language/expressions/prefix-decrement/target-cover-id.js Mismatch: 
tasks/coverage/test262/test/language/expressions/prefix-increment/target-cover-id.js -Mismatch: tasks/coverage/test262/test/language/literals/regexp/named-groups/invalid-lone-surrogate-groupname.js -Mismatch: tasks/coverage/test262/test/language/literals/regexp/u-surrogate-pairs-atom-escape-decimal.js Mismatch: tasks/coverage/test262/test/language/statements/for-in/head-lhs-cover.js Mismatch: tasks/coverage/test262/test/language/statements/for-of/head-lhs-async-parens.js Mismatch: tasks/coverage/test262/test/language/statements/for-of/head-lhs-cover.js diff --git a/tasks/coverage/snapshots/estree_typescript.snap b/tasks/coverage/snapshots/estree_typescript.snap index a16ef821a956a..ce83f499cbda7 100644 --- a/tasks/coverage/snapshots/estree_typescript.snap +++ b/tasks/coverage/snapshots/estree_typescript.snap @@ -4717,10 +4717,10 @@ Mismatch: tasks/coverage/typescript/tests/cases/conformance/es6/unicodeExtendedE Mismatch: tasks/coverage/typescript/tests/cases/conformance/es6/unicodeExtendedEscapes/unicodeExtendedEscapesInStrings08.ts Mismatch: tasks/coverage/typescript/tests/cases/conformance/es6/unicodeExtendedEscapes/unicodeExtendedEscapesInStrings09.ts tasks/coverage/typescript/tests/cases/conformance/es6/unicodeExtendedEscapes/unicodeExtendedEscapesInStrings10.ts -serde_json::from_str(oxc_json) error: invalid escape at line 30 column 28 +serde_json::from_str(oxc_json) error: unexpected end of hex escape at line 30 column 29 tasks/coverage/typescript/tests/cases/conformance/es6/unicodeExtendedEscapes/unicodeExtendedEscapesInStrings11.ts -serde_json::from_str(oxc_json) error: invalid escape at line 30 column 28 +serde_json::from_str(oxc_json) error: lone leading surrogate in hex escape at line 30 column 28 Mismatch: tasks/coverage/typescript/tests/cases/conformance/es6/unicodeExtendedEscapes/unicodeExtendedEscapesInStrings13.ts Mismatch: tasks/coverage/typescript/tests/cases/conformance/es6/unicodeExtendedEscapes/unicodeExtendedEscapesInStrings15.ts diff --git 
a/tasks/coverage/snapshots/parser_test262.snap b/tasks/coverage/snapshots/parser_test262.snap index 06466b0db792c..ef0fef74d1bf5 100644 --- a/tasks/coverage/snapshots/parser_test262.snap +++ b/tasks/coverage/snapshots/parser_test262.snap @@ -24775,7 +24775,7 @@ Expect to Parse: tasks/coverage/test262/test/language/statements/function/S14_A5 24 │ ╰──── - × Duplicated export '�' + × Duplicated export '�d83c' ╭─[test262/test/language/module-code/early-export-ill-formed-string.js:21:17] 20 │ // 🌙 is '\uD83C\uDF19' 21 │ export {Moon as "\uD83C",} from "./early-export-ill-formed-string.js"; diff --git a/tasks/coverage/src/tools/estree.rs b/tasks/coverage/src/tools/estree.rs index f99a130524e78..6adcb010adbe2 100644 --- a/tasks/coverage/src/tools/estree.rs +++ b/tasks/coverage/src/tools/estree.rs @@ -61,6 +61,7 @@ impl Case for EstreeTest262Case { // We don't filter them out because they are genuine test fails, but leaving this list here so // can uncomment this block when debugging any new test failures, to filter out "known bad". /* + #[expect(clippy::items_after_statements)] static IGNORE_PATHS: &[&str] = &[ // Missing `ParenthesizedExpression` on left side of assignment. // Oxc's parser does not support this, and we do not intend to fix. @@ -74,46 +75,6 @@ impl Case for EstreeTest262Case { "test262/test/language/statements/for-in/head-lhs-cover.js", "test262/test/language/statements/for-of/head-lhs-async-parens.js", "test262/test/language/statements/for-of/head-lhs-cover.js", - - // Lone surrogates in strings. - // We cannot pass these tests at present, as Oxc's parser does not handle them correctly. 
- // https://github.com/oxc-project/oxc/issues/3526#issuecomment-2650260735 - "test262/test/annexB/built-ins/RegExp/prototype/compile/pattern-string-u.js", - "test262/test/annexB/built-ins/String/prototype/substr/surrogate-pairs.js", - "test262/test/built-ins/Array/prototype/concat/Array.prototype.concat_spreadable-string-wrapper.js", - "test262/test/built-ins/JSON/stringify/value-string-escape-unicode.js", - "test262/test/built-ins/RegExp/dotall/with-dotall-unicode.js", - "test262/test/built-ins/RegExp/dotall/with-dotall.js", - "test262/test/built-ins/RegExp/dotall/without-dotall-unicode.js", - "test262/test/built-ins/RegExp/dotall/without-dotall.js", - "test262/test/built-ins/RegExp/escape/escaped-surrogates.js", - "test262/test/built-ins/RegExp/named-groups/non-unicode-property-names-invalid.js", - "test262/test/built-ins/RegExp/named-groups/unicode-property-names-invalid.js", - "test262/test/built-ins/RegExp/prototype/Symbol.replace/coerce-unicode.js", - "test262/test/built-ins/RegExp/prototype/exec/u-captured-value.js", - "test262/test/built-ins/RegExp/regexp-modifiers/add-dotAll.js", - "test262/test/built-ins/RegExp/regexp-modifiers/changing-dotAll-flag-does-not-affect-dotAll-modifier.js", - "test262/test/built-ins/RegExp/regexp-modifiers/nesting-add-dotAll-within-remove-dotAll.js", - "test262/test/built-ins/RegExp/regexp-modifiers/nesting-remove-dotAll-within-add-dotAll.js", - "test262/test/built-ins/RegExp/regexp-modifiers/remove-dotAll.js", - "test262/test/built-ins/String/prototype/at/returns-code-unit.js", - "test262/test/built-ins/String/prototype/codePointAt/return-first-code-unit.js", - "test262/test/built-ins/String/prototype/codePointAt/return-single-code-unit.js", - "test262/test/built-ins/String/prototype/isWellFormed/returns-boolean.js", - "test262/test/built-ins/String/prototype/match/regexp-prototype-match-v-u-flag.js", - "test262/test/built-ins/String/prototype/padEnd/normal-operation.js", - 
"test262/test/built-ins/String/prototype/padStart/normal-operation.js", - "test262/test/built-ins/String/prototype/toWellFormed/returns-well-formed-string.js", - "test262/test/built-ins/StringIteratorPrototype/next/next-iteration-surrogate-pairs.js", - "test262/test/intl402/NumberFormat/prototype/format/format-non-finite-numbers.js", - "test262/test/intl402/Segmenter/prototype/segment/containing/breakable-input.js", - "test262/test/intl402/Segmenter/prototype/segment/containing/unbreakable-input.js", - "test262/test/intl402/Segmenter/prototype/segment/containing/zero-index.js", - "test262/test/language/literals/regexp/named-groups/invalid-lone-surrogate-groupname.js", - "test262/test/language/literals/regexp/u-astral.js", - "test262/test/language/literals/regexp/u-surrogate-pairs-atom-char-class.js", - "test262/test/language/literals/regexp/u-surrogate-pairs-atom-escape-decimal.js", - "test262/test/language/statements/for-of/string-astral-truncated.js", ]; let path = &*self.path().to_string_lossy();