From 5b139aa4eef37395fa401618ccfd5167c4dfedfa Mon Sep 17 00:00:00 2001 From: overlookmotel <557937+overlookmotel@users.noreply.github.com> Date: Sun, 31 Aug 2025 00:31:03 +0000 Subject: [PATCH] feat(data_structures): add `ptr` and `end_ptr` methods to `SliceIterExt` (#13435) Add `ptr` and `end_ptr` methods to `SliceIterExt`. These methods get pointers to the start and end of slice iterators, using the minimum possible number of instructions (1). `end_ptr` also avoids unsafe code, unlike `iter.as_slice().as_ptr().add(iter.as_slice().len())`, which we used previously. Use these methods in codegen. --- crates/oxc_codegen/src/sourcemap_builder.rs | 6 +- crates/oxc_codegen/src/str.rs | 6 +- .../oxc_data_structures/src/slice_iter_ext.rs | 106 +++++++++++++++++- crates/oxc_estree/src/serialize/strings.rs | 16 ++- 4 files changed, 117 insertions(+), 17 deletions(-) diff --git a/crates/oxc_codegen/src/sourcemap_builder.rs b/crates/oxc_codegen/src/sourcemap_builder.rs index 2f4c84a432652..770aa48d1f84e 100644 --- a/crates/oxc_codegen/src/sourcemap_builder.rs +++ b/crates/oxc_codegen/src/sourcemap_builder.rs @@ -282,7 +282,7 @@ impl<'a> SourcemapBuilder<'a> { // Line break found. // `iter` is now positioned after line break. 
- line_start_ptr = iter.as_slice().as_ptr(); + line_start_ptr = iter.ptr(); self.generated_line += 1; self.generated_column = 0; last_line_is_ascii = true; @@ -290,8 +290,8 @@ impl<'a> SourcemapBuilder<'a> { // Calculate column self.generated_column += if last_line_is_ascii { - // `iter` is now exhausted, so `iter.as_slice().as_ptr()` is pointer to end of `output` - (iter.as_slice().as_ptr() as usize - line_start_ptr as usize) as u32 + // `iter` is now exhausted, so `iter.ptr()` is pointer to end of `output` + (iter.ptr() as usize - line_start_ptr as usize) as u32 } else { let line_byte_offset = line_start_ptr as usize - remaining.as_ptr() as usize; // TODO: It'd be better if could use `from_utf8_unchecked` here, but we'd need to make this diff --git a/crates/oxc_codegen/src/str.rs b/crates/oxc_codegen/src/str.rs index 904da0ea01891..30b115ea27525 100644 --- a/crates/oxc_codegen/src/str.rs +++ b/crates/oxc_codegen/src/str.rs @@ -46,7 +46,7 @@ impl Codegen<'_> { // String is written to buffer in chunks. let bytes = s.value.as_bytes().iter(); let mut state = PrintStringState { - chunk_start: bytes.as_slice().as_ptr(), + chunk_start: bytes.ptr(), bytes, quote, lone_surrogates: s.lone_surrogates, @@ -139,7 +139,7 @@ impl PrintStringState<'_> { /// Set the start of next chunk to be current position of `bytes` iterator. #[inline] fn start_chunk(&mut self) { - self.chunk_start = self.bytes.as_slice().as_ptr(); + self.chunk_start = self.bytes.ptr(); } /// Flush current chunk to buffer, consume 1 byte, and start next chunk after that byte. 
@@ -183,7 +183,7 @@ impl PrintStringState<'_> { // SAFETY: `chunk_start` is pointer to current position of `bytes` iterator at some point, // and the iterator only advances, so current position of `bytes` must be on or after `chunk_start` let len = unsafe { - let bytes_ptr = self.bytes.as_slice().as_ptr(); + let bytes_ptr = self.bytes.ptr(); bytes_ptr.offset_from_unsigned(self.chunk_start) }; diff --git a/crates/oxc_data_structures/src/slice_iter_ext.rs b/crates/oxc_data_structures/src/slice_iter_ext.rs index f460ad0d9091a..ede4bd494109d 100644 --- a/crates/oxc_data_structures/src/slice_iter_ext.rs +++ b/crates/oxc_data_structures/src/slice_iter_ext.rs @@ -1,6 +1,6 @@ //! Extension trait for slice iterators. //! -//! Provides additional methods to advance iterators. +//! Provides additional methods to inspect and advance iterators. //! //! See [`SliceIterExt`]. @@ -14,7 +14,7 @@ use crate::assert_unchecked; /// Extension trait for slice iterators. #[expect(private_bounds)] -pub trait SliceIterExt<'slice, T>: ExactSizeIterator + Sealed { +pub trait SliceIterExt<'slice, T>: ExactSizeIterator + AsRef<[T]> + Sealed { /// The type returned by `peek` method. type Peeked<'iter> where @@ -60,6 +60,24 @@ pub trait SliceIterExt<'slice, T>: ExactSizeIterator + Sealed { /// # SAFETY /// Iterator must contain at least `count` more items. unsafe fn advance_unchecked(&mut self, count: usize); + + /// Get pointer to next item in the iterator. + /// + /// Pointer is only valid to read an item from if iterator is not empty. + #[inline] + fn ptr(&self) -> *const T { + let slice = self.as_ref(); + slice.as_ptr() + } + + /// Get pointer to after last item in the iterator. + /// + /// Pointer is the end bound of the slice, so is not valid for reads. 
+ #[inline] + fn end_ptr(&self) -> *const T { + let slice = self.as_ref(); + slice.as_ptr_range().end + } } impl<'slice, T: 'slice> SliceIterExt<'slice, T> for Iter<'slice, T> { @@ -300,6 +318,48 @@ mod test_iter { assert_eq!(iter.next(), None); } } + + #[test] + fn ptr() { + let slice = [11u32, 22, 33]; + let start_addr = slice.as_ptr() as usize; + + let mut iter = slice.iter(); + assert_eq!(iter.ptr() as usize, start_addr); + + iter.next(); + assert_eq!(iter.ptr() as usize, start_addr + size_of::<u32>()); + + iter.next(); + assert_eq!(iter.ptr() as usize, start_addr + size_of::<u32>() * 2); + + iter.next(); + assert_eq!(iter.ptr() as usize, start_addr + size_of::<u32>() * 3); + + iter.next(); + assert_eq!(iter.ptr() as usize, start_addr + size_of::<u32>() * 3); + } + + #[test] + fn end_ptr() { + let slice = [11u32, 22, 33]; + let end_addr = slice.as_ptr() as usize + size_of::<u32>() * 3; + + let mut iter = slice.iter(); + assert_eq!(iter.end_ptr() as usize, end_addr); + + iter.next(); + assert_eq!(iter.end_ptr() as usize, end_addr); + + iter.next(); + assert_eq!(iter.end_ptr() as usize, end_addr); + + iter.next(); + assert_eq!(iter.end_ptr() as usize, end_addr); + + iter.next(); + assert_eq!(iter.end_ptr() as usize, end_addr); + } } #[cfg(test)] @@ -416,4 +476,46 @@ mod test_iter_mut { assert_eq!(iter.next(), None); } } + + #[test] + fn ptr() { + let mut slice = [11u32, 22, 33]; + let start_addr = slice.as_ptr() as usize; + + let mut iter = slice.iter_mut(); + assert_eq!(iter.ptr() as usize, start_addr); + + iter.next(); + assert_eq!(iter.ptr() as usize, start_addr + size_of::<u32>()); + + iter.next(); + assert_eq!(iter.ptr() as usize, start_addr + size_of::<u32>() * 2); + + iter.next(); + assert_eq!(iter.ptr() as usize, start_addr + size_of::<u32>() * 3); + + iter.next(); + assert_eq!(iter.ptr() as usize, start_addr + size_of::<u32>() * 3); + } + + #[test] + fn end_ptr() { + let mut slice = [11u32, 22, 33]; + let end_addr = slice.as_ptr() as usize + size_of::<u32>() * 3; + + let mut iter = slice.iter_mut(); + 
assert_eq!(iter.end_ptr() as usize, end_addr); + + iter.next(); + assert_eq!(iter.end_ptr() as usize, end_addr); + + iter.next(); + assert_eq!(iter.end_ptr() as usize, end_addr); + + iter.next(); + assert_eq!(iter.end_ptr() as usize, end_addr); + + iter.next(); + assert_eq!(iter.end_ptr() as usize, end_addr); + } } diff --git a/crates/oxc_estree/src/serialize/strings.rs b/crates/oxc_estree/src/serialize/strings.rs index 359a59ff0c155..42709c3c34363 100644 --- a/crates/oxc_estree/src/serialize/strings.rs +++ b/crates/oxc_estree/src/serialize/strings.rs @@ -326,7 +326,7 @@ fn write_str(s: &str, buffer: &mut CodeBuffer) { // an ASCII character, so must also be on a UTF-8 character boundary, and in bounds. // `chunk_start_ptr` is after a previous byte so must be `<= current_ptr`. unsafe { - let current_ptr = iter.as_slice().as_ptr(); + let current_ptr = iter.ptr(); let len = current_ptr.offset_from_unsigned(chunk_start_ptr); let chunk = slice::from_raw_parts(chunk_start_ptr, len); buffer.print_bytes_unchecked(chunk); @@ -348,7 +348,7 @@ fn write_str(s: &str, buffer: &mut CodeBuffer) { // Set `chunk_start_ptr` to after `\u{FFFD}fffd`. // That's a complete UTF-8 sequence, so `chunk_start_ptr` is definitely // left on a UTF-8 character boundary. - chunk_start_ptr = iter.as_slice().as_ptr(); + chunk_start_ptr = iter.ptr(); } else { // This is an escaped lone surrogate. // Next 4 bytes should be code point encoded as 4 x hex bytes. @@ -360,7 +360,7 @@ fn write_str(s: &str, buffer: &mut CodeBuffer) { // Print `\u`. Leave the hex bytes to be printed in next batch. // After lossy replacement character is definitely a UTF-8 boundary. buffer.print_str("\\u"); - chunk_start_ptr = iter.as_slice().as_ptr(); + chunk_start_ptr = iter.ptr(); // SAFETY: `iter.as_slice().get(..4).unwrap()` above would have panicked // if there weren't at least 4 bytes remaining in `iter`. 
@@ -378,7 +378,7 @@ fn write_str(s: &str, buffer: &mut CodeBuffer) { } // Print the chunk up to before the character which requires escaping. - let current_ptr = iter.as_slice().as_ptr(); + let current_ptr = iter.ptr(); // SAFETY: `escape` is only non-zero for ASCII bytes, except `Escape::LO` which is handled above. // Therefore `current_ptr` must be on an ASCII byte. // `chunk_start_ptr` is start of string originally, and is only updated to be after @@ -398,17 +398,15 @@ fn write_str(s: &str, buffer: &mut CodeBuffer) { // Set `chunk_start_ptr` to be after this character. // `escape` is only non-zero for ASCII bytes, except `Escape::LO` which is handled above. // We just consumed that ASCII byte, so `chunk_start_ptr` must be on a UTF-8 char boundary. - chunk_start_ptr = iter.as_slice().as_ptr(); + chunk_start_ptr = iter.ptr(); } // Print last chunk. - // SAFETY: Adding `len` to `ptr` cannot be out of bounds. - let end_ptr = unsafe { iter.as_slice().as_ptr().add(iter.as_slice().len()) }; // SAFETY: `chunk_start_ptr` is start of string originally, and is only updated to be after // an ASCII character, so must be on a UTF-8 character boundary, and in bounds. - // `chunk_start_ptr` is after a previous byte so must be `<= end_ptr`. + // `chunk_start_ptr` is after a previous byte so must be `<= iter.end_ptr()`. unsafe { - let len = end_ptr.offset_from_unsigned(chunk_start_ptr); + let len = iter.end_ptr().offset_from_unsigned(chunk_start_ptr); let chunk = slice::from_raw_parts(chunk_start_ptr, len); buffer.print_bytes_unchecked(chunk); }