Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions crates/oxc_ast/src/ast/js.rs
Original file line number Diff line number Diff line change
Expand Up @@ -461,8 +461,17 @@ pub struct TaggedTemplateExpression<'a> {
#[generate_derive(CloneIn, Dummy, TakeIn, GetSpan, GetSpanMut, ContentEq, ESTree)]
pub struct TemplateElement<'a> {
pub span: Span,
/// Raw and cooked text of this element.
///
/// ESTree serialization of this field goes through the `TemplateElementValue` converter
/// (which receives the whole element, so it can consult `lone_surrogates`)
/// instead of serializing the field directly.
#[estree(via = TemplateElementValue)]
pub value: TemplateElementValue<'a>,
/// Whether this is the last element of the template.
///
/// The parser sets this for `TemplateTail` and `NoSubstitutionTemplate` tokens.
pub tail: bool,
/// The template element contains lone surrogates.
///
/// `value.cooked` is encoded using `\u{FFFD}` (the lossy replacement character) as an escape character.
/// Lone surrogates are encoded as `\u{FFFD}XXXX`, where `XXXX` is the code unit in hex.
/// The lossy escape character itself is encoded as `\u{FFFD}fffd`.
///
/// This flag is not emitted as a field in the ESTree output (`#[estree(skip)]`), and builder
/// methods which do not take it as a parameter fall back to the default, `false`
/// (`#[builder(default)]`).
#[builder(default)]
#[estree(skip)]
pub lone_surrogates: bool,
}

/// See [template-strings-cooked-vs-raw](https://exploringjs.com/js/book/ch_template-literals.html#template-strings-cooked-vs-raw)
Expand Down
2 changes: 2 additions & 0 deletions crates/oxc_ast/src/generated/assert_layouts.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ const _: () = {
assert!(offset_of!(TemplateElement, span) == 0);
assert!(offset_of!(TemplateElement, value) == 8);
assert!(offset_of!(TemplateElement, tail) == 40);
assert!(offset_of!(TemplateElement, lone_surrogates) == 41);

assert!(size_of::<TemplateElementValue>() == 32);
assert!(align_of::<TemplateElementValue>() == 8);
Expand Down Expand Up @@ -1496,6 +1497,7 @@ const _: () = {
assert!(offset_of!(TemplateElement, span) == 0);
assert!(offset_of!(TemplateElement, value) == 8);
assert!(offset_of!(TemplateElement, tail) == 24);
assert!(offset_of!(TemplateElement, lone_surrogates) == 25);

assert!(size_of::<TemplateElementValue>() == 16);
assert!(align_of::<TemplateElementValue>() == 4);
Expand Down
45 changes: 44 additions & 1 deletion crates/oxc_ast/src/generated/ast_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1900,7 +1900,7 @@ impl<'a> AstBuilder<'a> {
value: TemplateElementValue<'a>,
tail: bool,
) -> TemplateElement<'a> {
TemplateElement { span, value, tail }
TemplateElement { span, value, tail, lone_surrogates: Default::default() }
}

/// Build a [`TemplateElement`], and store it in the memory arena.
Expand All @@ -1921,6 +1921,49 @@ impl<'a> AstBuilder<'a> {
Box::new_in(self.template_element(span, value, tail), self.allocator)
}

/// Construct a [`TemplateElement`] by value, with an explicit `lone_surrogates` flag.
///
/// The node is returned directly and is not placed in the memory arena.
/// Use [`AstBuilder::alloc_template_element_with_lone_surrogates`] to get an arena-allocated node instead.
///
/// ## Parameters
/// * `span`: The [`Span`] covering this node
/// * `value`
/// * `tail`
/// * `lone_surrogates`: The template element contains lone surrogates.
#[inline]
pub fn template_element_with_lone_surrogates(
    self,
    span: Span,
    value: TemplateElementValue<'a>,
    tail: bool,
    lone_surrogates: bool,
) -> TemplateElement<'a> {
    // Every field is supplied by the caller; nothing is defaulted here.
    TemplateElement { lone_surrogates, tail, value, span }
}

/// Build a [`TemplateElement`] with `lone_surrogates`, and store it in the memory arena.
///
/// Returns a [`Box`] containing the newly-allocated node. If you want a stack-allocated node, use [`AstBuilder::template_element_with_lone_surrogates`] instead.
///
/// ## Parameters
/// * `span`: The [`Span`] covering this node
/// * `value`
/// * `tail`
/// * `lone_surrogates`: The template element contains lone surrogates.
#[inline]
pub fn alloc_template_element_with_lone_surrogates(
self,
span: Span,
value: TemplateElementValue<'a>,
tail: bool,
lone_surrogates: bool,
) -> Box<'a, TemplateElement<'a>> {
Box::new_in(
self.template_element_with_lone_surrogates(span, value, tail, lone_surrogates),
self.allocator,
)
}

/// Build a [`MemberExpression::ComputedMemberExpression`].
///
/// This node contains a [`ComputedMemberExpression`] that will be stored in the memory arena.
Expand Down
2 changes: 2 additions & 0 deletions crates/oxc_ast/src/generated/derive_clone_in.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1097,6 +1097,7 @@ impl<'new_alloc> CloneIn<'new_alloc> for TemplateElement<'_> {
span: CloneIn::clone_in(&self.span, allocator),
value: CloneIn::clone_in(&self.value, allocator),
tail: CloneIn::clone_in(&self.tail, allocator),
lone_surrogates: CloneIn::clone_in(&self.lone_surrogates, allocator),
}
}

Expand All @@ -1105,6 +1106,7 @@ impl<'new_alloc> CloneIn<'new_alloc> for TemplateElement<'_> {
span: CloneIn::clone_in_with_semantic_ids(&self.span, allocator),
value: CloneIn::clone_in_with_semantic_ids(&self.value, allocator),
tail: CloneIn::clone_in_with_semantic_ids(&self.tail, allocator),
lone_surrogates: CloneIn::clone_in_with_semantic_ids(&self.lone_surrogates, allocator),
}
}
}
Expand Down
1 change: 1 addition & 0 deletions crates/oxc_ast/src/generated/derive_content_eq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,7 @@ impl ContentEq for TemplateElement<'_> {
/// Compare two template elements by content.
///
/// `value`, `tail` and `lone_surrogates` are compared, in that order, stopping at the first
/// mismatch. `span` does not take part in the comparison.
fn content_eq(&self, other: &Self) -> bool {
    if !ContentEq::content_eq(&self.value, &other.value) {
        return false;
    }
    if !ContentEq::content_eq(&self.tail, &other.tail) {
        return false;
    }
    ContentEq::content_eq(&self.lone_surrogates, &other.lone_surrogates)
}
}

Expand Down
1 change: 1 addition & 0 deletions crates/oxc_ast/src/generated/derive_dummy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ impl<'a> Dummy<'a> for TemplateElement<'a> {
span: Dummy::dummy(allocator),
value: Dummy::dummy(allocator),
tail: Dummy::dummy(allocator),
lone_surrogates: Dummy::dummy(allocator),
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion crates/oxc_ast/src/generated/derive_estree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,7 @@ impl ESTree for TemplateElement<'_> {
state.serialize_field("type", &JsonSafeString("TemplateElement"));
state.serialize_field("start", &self.span.start);
state.serialize_field("end", &self.span.end);
state.serialize_field("value", &self.value);
state.serialize_field("value", &crate::serialize::TemplateElementValue(self));
state.serialize_field("tail", &self.tail);
state.end();
}
Expand Down
46 changes: 46 additions & 0 deletions crates/oxc_ast/src/serialize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,52 @@ impl ESTree for RegExpFlagsConverter<'_> {
}
}

/// Serializer for the `value` field of `TemplateElement`.
///
/// Wraps a reference to the whole `TemplateElement` (not just its `value`), because the
/// serialized form depends on the element's `lone_surrogates` flag:
///
/// * Flag not set: `value` is serialized unchanged.
/// * Flag set: the cooked string contains lone surrogates, so `cooked` is serialized
///   through `LoneSurrogatesString`, while `raw` is serialized unchanged.
///
/// The `raw_deser` snippet below is the matching JS-side logic: when `cooked` is not `null`
/// and the flag is set, every `\uFFFD` followed by 4 characters in `cooked` is replaced by
/// the code point obtained by parsing those 4 characters as hex.
#[ast_meta]
#[estree(
ts_type = "TemplateElementValue",
raw_deser = r#"
let value = DESER[TemplateElementValue](POS_OFFSET.value);
if (value.cooked !== null && DESER[bool](POS_OFFSET.lone_surrogates)) {
value.cooked = value.cooked
.replace(/\uFFFD(.{4})/g, (_, hex) => String.fromCodePoint(parseInt(hex, 16)));
}
value
"#
)]
pub struct TemplateElementValue<'a, 'b>(pub &'b TemplateElement<'a>);

impl ESTree for TemplateElementValue<'_, '_> {
    /// Serialize the wrapped element's `value`.
    ///
    /// Elements without lone surrogates serialize `value` as is. Elements with the
    /// `lone_surrogates` flag set are handed to `serialize_lone_surrogates`.
    fn serialize<S: Serializer>(&self, serializer: S) {
        let template_element = self.0;

        // Common case is handled first: no lone surrogates, nothing special to do.
        if !template_element.lone_surrogates {
            template_element.value.serialize(serializer);
            return;
        }

        // String contains lone surrogates
        self.serialize_lone_surrogates(serializer);
    }
}

impl TemplateElementValue<'_, '_> {
    /// Serialize `value` of an element whose cooked string contains lone surrogates.
    ///
    /// Emits a struct with `raw` (unchanged) followed by `cooked`, where a present `cooked`
    /// string is wrapped in `LoneSurrogatesString` and an absent one stays `None`.
    ///
    /// Kept out of line and marked cold, as it is only called when the `lone_surrogates`
    /// flag is set.
    #[cold]
    #[inline(never)]
    fn serialize_lone_surrogates<S: Serializer>(&self, serializer: S) {
        let element_value = &self.0.value;
        let wrapped_cooked =
            element_value.cooked.as_ref().map(|atom| LoneSurrogatesString(atom.as_str()));

        let mut object = serializer.serialize_struct();
        object.serialize_field("raw", &element_value.raw);
        object.serialize_field("cooked", &wrapped_cooked);
        object.end();
    }
}

// --------------------
// Various
// --------------------
Expand Down
4 changes: 3 additions & 1 deletion crates/oxc_parser/src/js/expression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,7 @@ impl<'a> ParserImpl<'a> {
// `cooked = None` when template literal has invalid escape sequence
// This is matched by `is_valid_escape_sequence` in `Lexer::read_template_literal`
let cooked = self.cur_template_string();
let lone_surrogates = self.cur_token().lone_surrogates;

let cur_src = self.cur_src();
let raw = &cur_src[1..cur_src.len() - end_offset as usize];
Expand All @@ -560,10 +561,11 @@ impl<'a> ParserImpl<'a> {
}

let tail = matches!(cur_kind, Kind::TemplateTail | Kind::NoSubstitutionTemplate);
self.ast.template_element(
self.ast.template_element_with_lone_surrogates(
span,
TemplateElementValue { raw, cooked: cooked.map(Atom::from) },
tail,
lone_surrogates,
)
}

Expand Down
66 changes: 59 additions & 7 deletions crates/oxc_parser/src/lexer/template.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::cmp::max;
use std::{cmp::max, str};

use oxc_allocator::String;

Expand All @@ -11,9 +11,26 @@ use super::{

const MIN_ESCAPED_TEMPLATE_LIT_LEN: usize = 16;

/// Convert `char` to its UTF-8 encoding as a fixed-size byte array.
///
/// `N` must be exactly the number of bytes `ch` occupies in UTF-8 (`ch.len_utf8()`).
///
/// # Panics
///
/// Panics if `N` is not equal to `ch.len_utf8()`. When evaluated in a `const` context,
/// this is a compile-time error. Without this check, an `N` larger than required would
/// silently produce an array padded with trailing zero bytes, which is not the encoding
/// of `ch`.
const fn to_bytes<const N: usize>(ch: char) -> [u8; N] {
    assert!(ch.len_utf8() == N, "`N` must equal the UTF-8 encoded length of `ch`");

    let mut bytes = [0u8; N];
    // The returned `&mut str` is not needed, only the bytes written into the buffer.
    ch.encode_utf8(&mut bytes);
    bytes
}

/// Lossy replacement character (U+FFFD) as UTF-8 bytes.
const LOSSY_REPLACEMENT_CHAR_BYTES: [u8; 3] = to_bytes('\u{FFFD}');
/// First byte of the UTF-8 encoding of the lossy replacement character.
const LOSSY_REPLACEMENT_CHAR_FIRST_BYTE: u8 = LOSSY_REPLACEMENT_CHAR_BYTES[0];
// Compile-time check that the first byte is 0xEF, which the lexer's handling of this byte relies on.
const _: () = assert!(LOSSY_REPLACEMENT_CHAR_FIRST_BYTE == 0xEF);

/// Byte match table for template literals: matches `$`, '`', `\r` and `\`.
static TEMPLATE_LITERAL_TABLE: SafeByteMatchTable =
safe_byte_match_table!(|b| matches!(b, b'$' | b'`' | b'\r' | b'\\'));

/// Same as `TEMPLATE_LITERAL_TABLE`, but with 1st byte of lossy replacement character added.
///
/// The table matches single bytes only, so it matches any character whose UTF-8 encoding
/// starts with that byte. Code searching with this table must inspect the following 2 bytes
/// to tell whether the character really is U+FFFD.
static TEMPLATE_LITERAL_ESCAPED_MATCH_TABLE: SafeByteMatchTable = safe_byte_match_table!(
|b| matches!(b, b'$' | b'`' | b'\r' | b'\\' | LOSSY_REPLACEMENT_CHAR_FIRST_BYTE)
);

/// 12.8.6 Template Literal Lexical Components
impl<'a> Lexer<'a> {
/// Read template literal component.
Expand Down Expand Up @@ -206,7 +223,7 @@ impl<'a> Lexer<'a> {

byte_search! {
lexer: self,
table: TEMPLATE_LITERAL_TABLE,
table: TEMPLATE_LITERAL_ESCAPED_MATCH_TABLE,
start: pos,
continue_if: (next_byte, pos) {
if next_byte == b'$' {
Expand Down Expand Up @@ -238,7 +255,8 @@ impl<'a> Lexer<'a> {
cold_branch(|| true)
}
} else {
// Next byte is '`', `\r` or `\`. Add chunk up to before this char to `str`.
// Next byte is '`', `\r`, `\`, or first byte of lossy replacement character.
// Add chunk up to before this char to `str`.
// SAFETY: Caller guarantees `chunk_start` is not after `pos` at start of
// this function. `pos` only increases during searching.
// Where `chunk_start` is updated, it's always before or equal to `pos`.
Expand Down Expand Up @@ -293,10 +311,7 @@ impl<'a> Lexer<'a> {
// Continue searching
true
}
_ => {
// `TEMPLATE_LITERAL_TABLE` only matches `$`, '`', `\r` and `\`
debug_assert!(next_byte == b'\\');

b'\\' => {
// Decode escape sequence into `str`.
// `read_string_escape_sequence` expects `self.source` to be positioned after `\`.
// SAFETY: Next byte is `\`, which is ASCII, so `pos + 1` is UTF-8 char boundary.
Expand All @@ -315,6 +330,43 @@ impl<'a> Lexer<'a> {
// backwards from that, so subtracting 1 again is within bounds.
pos = unsafe {chunk_start.sub(1)};

// Continue searching
true
}
_ => {
// `TEMPLATE_LITERAL_ESCAPED_MATCH_TABLE` only matches `$`, '`', `\r`, `\`,
// or first byte of lossy replacement character
debug_assert!(next_byte == LOSSY_REPLACEMENT_CHAR_FIRST_BYTE);

// SAFETY: 0xEF is always first byte of a 3-byte UTF-8 character,
// so there must be 2 more bytes to read
let next2 = unsafe { pos.add(1).read2() };
if next2 == [LOSSY_REPLACEMENT_CHAR_BYTES[1], LOSSY_REPLACEMENT_CHAR_BYTES[2]]
&& self.token.lone_surrogates
{
str.push_str("\u{FFFD}fffd");
} else {
let bytes = [LOSSY_REPLACEMENT_CHAR_FIRST_BYTE, next2[0], next2[1]];
// SAFETY: 0xEF is always first byte of a 3-byte UTF-8 character,
// so these 3 bytes must comprise a valid UTF-8 string
let s = unsafe { str::from_utf8_unchecked(&bytes) };
str.push_str(s);
}

// Advance past this character.
// SAFETY: Character is 3 bytes, so `pos + 2` is in bounds.
// Note: `byte_search!` macro already advances `pos` by 1, so only
// advance by 2 here, so that in total we skip 3 bytes.
pos = unsafe { pos.add(2) };

// Set next chunk to start after this character.
// SAFETY: It's a 3 byte character, and we added 2 to `pos` above,
// so `pos + 1` must be a UTF-8 char boundary.
// This temporarily puts `chunk_start` 1 byte after `pos`, but `byte_search!` macro
// increments `pos` when return `true` from `continue_if`, so `pos` will be
// brought up to `chunk_start` again.
chunk_start = unsafe { pos.add(1) };

// Continue searching
true
}
Expand Down
7 changes: 6 additions & 1 deletion napi/parser/deserialize-js.js
Original file line number Diff line number Diff line change
Expand Up @@ -148,11 +148,16 @@ function deserializeTaggedTemplateExpression(pos) {
}

// Deserialize a `TemplateElement` node located at byte offset `pos`.
// Reads: span start at `pos`, span end at `pos + 4`, `value` at `pos + 8`,
// `tail` flag at `pos + 40`, and a second bool flag at `pos + 41`.
function deserializeTemplateElement(pos) {
let value = deserializeTemplateElementValue(pos + 8);
// When the flag at `pos + 41` is set and `cooked` is present, `cooked` is in escaped form:
// each U+FFFD followed by 4 characters is replaced by the code point whose hex value
// those 4 characters spell.
if (value.cooked !== null && deserializeBool(pos + 41)) {
value.cooked = value.cooked
.replace(/\uFFFD(.{4})/g, (_, hex) => String.fromCodePoint(parseInt(hex, 16)));
}
return {
type: 'TemplateElement',
start: deserializeU32(pos),
end: deserializeU32(pos + 4),
// NOTE(review): `value` is defined twice in this literal; the later shorthand `value` wins,
// so the line below looks like the pre-change side of the diff. Confirm it should be dropped.
value: deserializeTemplateElementValue(pos + 8),
value,
tail: deserializeBool(pos + 40),
};
}
Expand Down
7 changes: 6 additions & 1 deletion napi/parser/deserialize-ts.js
Original file line number Diff line number Diff line change
Expand Up @@ -159,11 +159,16 @@ function deserializeTaggedTemplateExpression(pos) {
}

// Deserialize a `TemplateElement` node located at byte offset `pos`.
// Reads: span start at `pos`, span end at `pos + 4`, `value` at `pos + 8`,
// `tail` flag at `pos + 40`, and a second bool flag at `pos + 41`.
function deserializeTemplateElement(pos) {
let value = deserializeTemplateElementValue(pos + 8);
// When the flag at `pos + 41` is set and `cooked` is present, `cooked` is in escaped form:
// each U+FFFD followed by 4 characters is replaced by the code point whose hex value
// those 4 characters spell.
if (value.cooked !== null && deserializeBool(pos + 41)) {
value.cooked = value.cooked
.replace(/\uFFFD(.{4})/g, (_, hex) => String.fromCodePoint(parseInt(hex, 16)));
}
return {
type: 'TemplateElement',
start: deserializeU32(pos),
end: deserializeU32(pos + 4),
// NOTE(review): `value` is defined twice in this literal; the later shorthand `value` wins,
// so the line below looks like the pre-change side of the diff. Confirm it should be dropped.
value: deserializeTemplateElementValue(pos + 8),
value,
tail: deserializeBool(pos + 40),
};
}
Expand Down
4 changes: 4 additions & 0 deletions napi/parser/test/parse-raw.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,10 @@ describe('edge cases', () => {
';"\\uD800\\uDBFF";',
';"�\\u{FFFD}";',
';"�\\u{FFFD}\\uD800\\uDBFF�\\u{FFFD}";',
// `TemplateLiteral`s containing lone surrogates and/or lossy replacement characters
'`\\uD800\\uDBFF${x}\\uD800\\uDBFF`;',
'`�\\u{FFFD}${x}�\\u{FFFD}`;',
'`�\\u{FFFD}\\uD800${x}\\uDBFF�\\u{FFFD}`;',
])('%s', (sourceText) => {
assertRawAndStandardMatch('dummy.js', sourceText);
});
Expand Down
Loading
Loading