Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 9 additions & 23 deletions crates/oxc_ast_visit/src/utf8_to_utf16/converter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use super::Translation;
/// This range starts at byte `range_start`, and is `range_len` bytes long.
/// The range describes a stretch of source text which contains only ASCII characters.
/// A UTF-8 offset within this range can be converted to UTF-16 offset with the formula
/// `utf16_offset = (utf8_offset - range_start_utf8).wrapping_add(range_start_utf16)`.
/// `utf16_offset = utf8_offset - range_start_utf8 + range_start_utf16`.
///
/// [`convert_offset`] has a very fast path for converting offsets in the current range.
///
Expand All @@ -36,12 +36,10 @@ pub struct Utf8ToUtf16Converter<'t> {
range_len_utf8: u32,
/// UTF-16 offset of start of range.
/// To convert offset within this range:
/// `utf16_offset = (utf8_offset - range_start_utf8).wrapping_add(range_start_utf16)`.
/// Note: `range_start_utf16` is calculated and used with wrapping addition/subtraction,
/// because it can wrap around when there's a Unicode character very close to start of source.
/// `utf16_offset = utf8_offset - range_start_utf8 + range_start_utf16`.
/// We store UTF-16 range start, rather than `utf16_difference`, because it makes
/// [`Self::convert_offset`] more efficient - 1 less instruction, and 1 less register.
/// <https://godbolt.org/z/1xnx1v17T>
/// <https://godbolt.org/z/hz5xWGfYn>
range_start_utf16: u32,
/// Index of current `Translation`
index: u32,
Expand Down Expand Up @@ -111,7 +109,7 @@ impl<'t> Utf8ToUtf16Converter<'t> {
//
// This method is written to reduce this common path to as few instructions as possible.
// It's only 8 instructions on x86_64, with 2 branches, and using only 1 register.
// https://godbolt.org/z/1xnx1v17T
// https://godbolt.org/z/hz5xWGfYn
//
// `#[inline(always)]` because this function is small and on a very hot path.
#[expect(clippy::inline_always)]
Expand All @@ -134,10 +132,9 @@ impl<'t> Utf8ToUtf16Converter<'t> {
}

let bytes_from_start_of_range = utf8_offset.wrapping_sub(self.range_start_utf8);
if bytes_from_start_of_range <= self.range_len_utf8 {
// Offset is within current range.
// `wrapping_add` because `range_start_utf16` can be `u32::MAX`.
*offset = self.range_start_utf16.wrapping_add(bytes_from_start_of_range);
if bytes_from_start_of_range < self.range_len_utf8 {
// Offset is within current range
*offset = self.range_start_utf16 + bytes_from_start_of_range;
} else {
// Offset is outside current range - slow path
self.convert_offset_slow(offset);
Expand Down Expand Up @@ -179,18 +176,7 @@ impl<'t> Utf8ToUtf16Converter<'t> {
self.index = index as u32;
self.range_start_utf8 = range_start_utf8;
self.range_len_utf8 = range_end_utf8 - range_start_utf8;

// `wrapping_sub` because `utf16_difference` can be `> range_start_utf8` where one of
// first few characters of source is Unicode. e.g.:
//
// * 1st char is Unicode:
// * `range_start_utf8 = 1` (offsets in `Translation`s are the offset of the character + 1).
// * `utf16_difference` is the length of the Unicode char, which is `> 1`.
//
// * If 1st 2 chars are ASCII, but 3rd char is a 4-byte Unicode char:
// * `range_start_utf8 = 3`.
// * `utf16_difference = 4`.
self.range_start_utf16 = range_start_utf8.wrapping_sub(utf16_difference);
self.range_start_utf16 = range_start_utf8 - utf16_difference;

*offset = utf8_offset - utf16_difference;
}
Expand Down Expand Up @@ -252,7 +238,7 @@ impl<'t> Utf8ToUtf16Converter<'t> {
const LINEAR_SEARCH_ITERATIONS: usize = 8;

// `utf8_offset` is after current range, so there must be another range after this one.
// We don't need to include next range in search because we know it starts before `utf8_offset`,
// We don't need to include next range in search because we know it starts on or before `utf8_offset`,
// and we're looking for a range which starts *after* `utf8_offset`.
//
// Note: `translations` is a slice, which has max length of `isize::MAX` on all platforms.
Expand Down
16 changes: 8 additions & 8 deletions crates/oxc_ast_visit/src/utf8_to_utf16/translation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,17 +82,17 @@ pub fn build_translations(source_text: &str, translations: &mut Vec<Translation>
// Closure that processes a slice of bytes
let mut process_slice = |slice: &[u8], start_offset: usize| {
for (index, &byte) in slice.iter().enumerate() {
#[expect(clippy::cast_possible_truncation)]
if byte >= 0xC0 {
let difference_for_this_byte = u32::from(byte >= 0xE0) + 1;
utf16_difference += difference_for_this_byte;
// Record `offset + 1` not `offset`, because it's only offsets *after* this
// Unicode character that need to be shifted.
// `offset + 1` cannot overflow, because source is limited to `u32::MAX` bytes,
// so a multi-byte Unicode character can't start at offset `u32::MAX`, because there
// isn't space to complete the multi-byte sequence, which would not be a valid `&str`.
let offset = start_offset + index;
let utf8_offset = (offset + 1) as u32;

// Record the index of the end of this Unicode character, because it's only offsets
// *after* this Unicode character that need to be shifted.
// Addition cannot overflow because length of source text is max `u32::MAX`.
let bytes_in_char =
difference_for_this_byte as usize + usize::from(byte >= 0xF0) + 1;
#[expect(clippy::cast_possible_truncation)]
let utf8_offset = (start_offset + index + bytes_in_char) as u32;
translations.push(Translation { utf8_offset, utf16_difference });
}
}
Expand Down
Loading