diff --git a/crates/oxc_parser/Cargo.toml b/crates/oxc_parser/Cargo.toml index 8189fa7b2a16d..db925aad87940 100644 --- a/crates/oxc_parser/Cargo.toml +++ b/crates/oxc_parser/Cargo.toml @@ -21,7 +21,7 @@ doctest = false [dependencies] oxc_allocator = { workspace = true } oxc_ast = { workspace = true } -oxc_data_structures = { workspace = true, features = ["assert_unchecked"] } +oxc_data_structures = { workspace = true, features = ["assert_unchecked", "pointer_ext"] } oxc_diagnostics = { workspace = true } oxc_ecmascript = { workspace = true } oxc_regular_expression = { workspace = true, optional = true } diff --git a/crates/oxc_parser/src/lexer/comment.rs b/crates/oxc_parser/src/lexer/comment.rs index c1be03a9d3f80..ce6fc10ee51b8 100644 --- a/crates/oxc_parser/src/lexer/comment.rs +++ b/crates/oxc_parser/src/lexer/comment.rs @@ -94,7 +94,7 @@ impl<'a> Lexer<'a> { if next_byte == b'*' { // SAFETY: Next byte is `*` (ASCII) so after it is UTF-8 char boundary let after_star = unsafe { pos.add(1) }; - if after_star.addr() < self.source.end_addr() { + if after_star.is_not_end_of(&self.source) { // If next byte isn't `/`, continue // SAFETY: Have checked there's at least 1 further byte to read if unsafe { after_star.read() } == b'/' { diff --git a/crates/oxc_parser/src/lexer/identifier.rs b/crates/oxc_parser/src/lexer/identifier.rs index 689621e41b6ff..0669b567a57d9 100644 --- a/crates/oxc_parser/src/lexer/identifier.rs +++ b/crates/oxc_parser/src/lexer/identifier.rs @@ -215,7 +215,7 @@ impl<'a> Lexer<'a> { pub fn private_identifier(&mut self) -> Kind { // Handle EOF directly after `#` let start_pos = self.source.position(); - if start_pos.addr() == self.source.end_addr() { + if start_pos.is_end_of(&self.source) { return cold_branch(|| { let start = self.offset(); self.error(diagnostics::unexpected_end(Span::new(start, start))); diff --git a/crates/oxc_parser/src/lexer/search.rs b/crates/oxc_parser/src/lexer/search.rs index 7cf78628608a5..425b8ade9ef21 100644 --- a/crates/oxc_parser/src/lexer/search.rs +++ b/crates/oxc_parser/src/lexer/search.rs @@ -437,7 +437,7 @@ macro_rules! byte_search { // Silence warnings if macro called in unsafe code #[allow(unused_unsafe, clippy::unnecessary_safety_comment, clippy::allow_attributes)] 'outer: loop { - let $byte = if $pos.addr() <= $lexer.source.end_for_batch_search_addr() { + let $byte = if $pos.can_read_batch_from(&$lexer.source) { // Search a batch of `SEARCH_BATCH_SIZE` bytes. // // `'inner: loop {}` is not a real loop - it always exits on first turn. @@ -447,8 +447,8 @@ macro_rules! byte_search { // compiler to unroll it. // // SAFETY: - // `$pos.addr() <= lexer.source.end_for_batch_search_addr()` check above ensures - // there are at least `SEARCH_BATCH_SIZE` bytes remaining in `lexer.source`. + // `$pos.can_read_batch_from(&$lexer.source)` check above ensures there are + // at least `SEARCH_BATCH_SIZE` bytes remaining in `lexer.source`. // So calls to `$pos.read()` and `$pos.add(1)` in this loop cannot go out of bounds. 'inner: loop { for _i in 0..crate::lexer::search::SEARCH_BATCH_SIZE { @@ -469,9 +469,8 @@ macro_rules! byte_search { } else { // Not enough bytes remaining for a batch. Process byte-by-byte. // Same as above, `'inner: loop {}` is not a real loop here - always exits on first turn. - let end_addr = $lexer.source.end_addr(); 'inner: loop { - while $pos.addr() < end_addr { + while $pos.is_not_end_of(&$lexer.source) { // SAFETY: `pos` is not at end of source, so safe to read a byte let byte = unsafe { $pos.read() }; if $table.matches(byte) { diff --git a/crates/oxc_parser/src/lexer/source.rs b/crates/oxc_parser/src/lexer/source.rs index f6793dfc04626..7dc6903feee93 100644 --- a/crates/oxc_parser/src/lexer/source.rs +++ b/crates/oxc_parser/src/lexer/source.rs @@ -1,4 +1,6 @@ -use std::{marker::PhantomData, slice, str}; +use std::{cmp::Ordering, marker::PhantomData, slice, str}; + +use oxc_data_structures::pointer_ext::PointerExt; use crate::{MAX_LEN, UniquePromise}; @@ -107,12 +109,7 @@ impl<'a> Source<'a> { // SAFETY: // `start` and `end` are created from a `&str` in `Source::new`, so `start` cannot be after `end`. // `start` and `end` are by definition on UTF-8 char boundaries. - unsafe { - self.str_between_positions_unchecked( - SourcePosition::new(self.start), - SourcePosition::new(self.end), - ) - } + unsafe { self.str_between_positions_unchecked(self.start(), self.end()) } } /// Get remaining source text as `&str`. @@ -121,31 +118,35 @@ impl<'a> Source<'a> { // SAFETY: // Invariant of `Source` is that `ptr` is always <= `end`, and is on a UTF-8 char boundary. // `end` is pointer to end of original `&str`, so by definition on a UTF-8 char boundary. - unsafe { - self.str_between_positions_unchecked( - SourcePosition::new(self.ptr), - SourcePosition::new(self.end), - ) - } + unsafe { self.str_between_positions_unchecked(self.position(), self.end()) } } - /// Return whether at end of source. + /// Get number of bytes of source text remaining. #[inline] - pub(super) fn is_eof(&self) -> bool { - self.ptr == self.end + fn remaining_bytes(&self) -> usize { + // SAFETY: Invariant of `Source` is that `ptr` is always <= `end` + unsafe { self.end().offset_from(self.position()) } + } + + /// Get [`SourcePosition`] for start of source. + #[inline] + fn start(&self) -> SourcePosition<'a> { + // SAFETY: `start` is the start of source, so in bounds and on a UTF-8 char boundary + unsafe { SourcePosition::new(self.start) } } - /// Get end address. + /// Get [`SourcePosition`] for end of source. #[inline] - pub(super) fn end_addr(&self) -> usize { - self.end as usize + fn end(&self) -> SourcePosition<'a> { + // SAFETY: `end` is the end of source, so in bounds and on a UTF-8 char boundary + unsafe { SourcePosition::new(self.end) } } - /// Get last memory address at which a batch of `Lexer::search::SEARCH_BATCH_SIZE` bytes - /// can be read without going out of bounds. + /// Return whether at end of source. #[inline] - pub(super) fn end_for_batch_search_addr(&self) -> usize { - self.end_for_batch_search_addr + pub(super) fn is_eof(&self) -> bool { + // TODO: Use `self.remaining_bytes() == 0` instead? + self.ptr == self.end } /// Get current position. @@ -205,16 +206,16 @@ impl<'a> Source<'a> { debug_assert!(ascii_byte.is_ascii()); let matched = self.peek_byte() == Some(ascii_byte); if matched { - // SAFETY: next byte exists and is a valid ASCII char (and thus UTF-8 - // char boundary). + // SAFETY: next byte exists and is a valid ASCII char (and thus UTF-8 char boundary). self.ptr = unsafe { self.ptr.add(1) }; } matched } /// Get string slice from a `SourcePosition` up to the current position of `Source`. + #[inline] pub(super) fn str_from_pos_to_current(&self, pos: SourcePosition<'a>) -> &'a str { - assert!(pos.ptr <= self.ptr); + assert!(pos <= self.position()); // SAFETY: The above assertion satisfies `str_from_pos_to_current_unchecked`'s requirements unsafe { self.str_from_pos_to_current_unchecked(pos) } } @@ -231,9 +232,8 @@ impl<'a> Source<'a> { &self, pos: SourcePosition<'a>, ) -> &'a str { - // SAFETY: Caller guarantees `pos` is not after current position of `Source`. - // `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`. - unsafe { self.str_between_positions_unchecked(pos, SourcePosition::new(self.ptr)) } + // SAFETY: Caller guarantees `pos` is not after current position of `Source` + unsafe { self.str_between_positions_unchecked(pos, self.position()) } } /// Get string slice from current position of `Source` up to a `SourcePosition`, without checks. @@ -248,18 +248,16 @@ impl<'a> Source<'a> { &self, pos: SourcePosition<'a>, ) -> &'a str { - // SAFETY: Caller guarantees `pos` is not before current position of `Source`. - // `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`. - unsafe { self.str_between_positions_unchecked(SourcePosition::new(self.ptr), pos) } + // SAFETY: Caller guarantees `pos` is not before current position of `Source` + unsafe { self.str_between_positions_unchecked(self.position(), pos) } } /// Get string slice from a `SourcePosition` up to the end of `Source`. #[inline] pub(super) fn str_from_pos_to_end(&self, pos: SourcePosition<'a>) -> &'a str { // SAFETY: Invariants of `SourcePosition` is that it cannot be after end of `Source`, - // and always on a UTF-8 character boundary. - // `self.end` is always a valid `SourcePosition` due to invariants of `Source`. - unsafe { self.str_between_positions_unchecked(pos, SourcePosition::new(self.end)) } + // and always on a UTF-8 character boundary + unsafe { self.str_between_positions_unchecked(pos, self.end()) } } /// Get string slice of source between 2 `SourcePosition`s, without checks. @@ -296,24 +294,36 @@ impl<'a> Source<'a> { // on UTF-8 character boundaries. So slicing source text between these 2 points will always // yield a valid UTF-8 string. unsafe { - let len = end.addr() - start.addr(); + let len = end.offset_from(start); let slice = slice::from_raw_parts(start.ptr, len); std::str::from_utf8_unchecked(slice) } } - /// Get current position in source, relative to start of source. + /// Get current position in source, relative to start of source, as `u32`. #[inline] pub(super) fn offset(&self) -> u32 { self.offset_of(self.position()) } - /// Get offset of `pos`. - #[expect(clippy::cast_possible_truncation)] + /// Get current position in source, relative to start of source, as `usize`. + #[inline] + pub(super) fn offset_usize(&self) -> usize { + self.offset_of_usize(self.position()) + } + + /// Get offset of `pos` as `u32`. #[inline] pub(super) fn offset_of(&self, pos: SourcePosition<'a>) -> u32 { - // Cannot overflow `u32` because of `MAX_LEN` check in `Source::new` - (pos.addr() - self.start as usize) as u32 + // SAFETY: All `SourcePosition`s are always in bounds of the source text, which starts at `start` + unsafe { pos.offset_from_u32(self.start()) } + } + + /// Get offset of `pos` as `usize`. + #[inline] + pub(super) fn offset_of_usize(&self, pos: SourcePosition<'a>) -> usize { + // SAFETY: All `SourcePosition`s are always in bounds of the source text, which starts at `start` + unsafe { pos.offset_from(self.start()) } } /// Move current position back by `n` bytes. @@ -333,7 +343,7 @@ impl<'a> Source<'a> { assert!(n > 0, "Cannot call `Source::back` with 0"); // Ensure not attempting to go back to before start of source - let offset = self.ptr as usize - self.start as usize; + let offset = self.offset_usize(); assert!(n <= offset, "Cannot go back {n} bytes - only {offset} bytes consumed"); // SAFETY: We have checked that `n` is less than distance between `start` and `ptr`, @@ -525,11 +535,9 @@ impl<'a> Source<'a> { /// Peek next two bytes of source without consuming them. #[inline] pub(super) fn peek_2_bytes(&self) -> Option<[u8; 2]> { - // `end` is always >= `ptr` so `end - ptr` cannot wrap around. - // No need to use checked/saturating subtraction here. - if (self.end as usize) - (self.ptr as usize) >= 2 { + if self.remaining_bytes() >= 2 { // SAFETY: The check above ensures that there are at least 2 bytes to - // read from `self.ptr` without reading past `self.end` + // read from current position without reading past end let bytes = unsafe { self.position().read2() }; Some(bytes) } else { @@ -559,13 +567,13 @@ impl<'a> Source<'a> { /// # SAFETY /// `SourcePosition` must always be on a UTF-8 character boundary, /// and within bounds of the `Source` that created it. -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct SourcePosition<'a> { ptr: *const u8, _marker: PhantomData<&'a u8>, } -impl SourcePosition<'_> { +impl<'a> SourcePosition<'a> { /// Create a new `SourcePosition` from a pointer. /// /// # SAFETY @@ -579,12 +587,6 @@ impl SourcePosition<'_> { Self { ptr, _marker: PhantomData } } - /// Get memory address of `SourcePosition` as a `usize`. - #[inline] - pub(super) fn addr(self) -> usize { - self.ptr as usize - } - /// Create new `SourcePosition` which is `n` bytes after this one. /// The provenance of the pointer `SourcePosition` contains is maintained. /// @@ -611,6 +613,64 @@ impl SourcePosition<'_> { unsafe { Self::new(self.ptr.sub(n)) } } + /// Get the distance between this [`SourcePosition`] and another [`SourcePosition`] as `usize`. + /// + /// # SAFETY + /// `self` must be equal to or after `origin`. + #[inline] + pub(super) unsafe fn offset_from(self, origin: Self) -> usize { + // SAFETY: Caller guarantees `self` is not before `origin`. + // All `SourcePosition<'a>`s are within bounds of same source text. + unsafe { self.ptr.offset_from_usize(origin.ptr) } + } + + /// Get the distance between this [`SourcePosition`] and another [`SourcePosition`] as `u32`. + /// + /// # SAFETY + /// `self` must be equal to or after `origin`. + #[inline] + pub(super) unsafe fn offset_from_u32(self, origin: Self) -> u32 { + // SAFETY: Caller guarantees `self` is not before `origin`. + let offset = unsafe { self.offset_from(origin) }; + // Cannot overflow `u32` because of `MAX_LEN` check in `Source::new` + #[expect(clippy::cast_possible_truncation)] + let offset = offset as u32; + offset + } + + /// Get the distance between this [`SourcePosition`] and another [`SourcePosition`] as `isize`. + /// + /// Return value will be positive if `self` is after `other`, + /// negative if `self` is before `other`, or 0 if they're the same. + #[inline] + fn offset_from_signed(self, other: Self) -> isize { + // SAFETY: All `SourcePosition<'a>`s are within bounds of same source text + unsafe { self.ptr.offset_from(other.ptr) } + } + + /// Returns `true` if this [`SourcePosition`] is at end of `source`. + #[inline] + pub(super) fn is_end_of(self, source: &Source<'a>) -> bool { + // TODO: Use `source.end().offset_from(self) == 0` instead? + self.ptr == source.end + } + + /// Returns `true` if this [`SourcePosition`] is not at end of `source`. + #[inline] + pub(super) fn is_not_end_of(self, source: &Source<'a>) -> bool { + !self.is_end_of(source) + } + + /// Check if this [`SourcePosition`] is valid for reading `Lexer::search::SEARCH_BATCH_SIZE` bytes + /// from `source`. + /// i.e. is position at least `Lexer::search::SEARCH_BATCH_SIZE` bytes from the end of `source`? + /// + /// Returns `true` if safe to read `Lexer::search::SEARCH_BATCH_SIZE` from this position. + #[inline] + pub(super) fn can_read_batch_from(&self, source: &Source<'a>) -> bool { + self.ptr as usize <= source.end_for_batch_search_addr + } + /// Read byte from this `SourcePosition`. /// /// # SAFETY @@ -664,6 +724,50 @@ impl SourcePosition<'_> { } } +// Implement `Ord` and `PartialOrd` using `(*const u8)::offset_from` to utilize the invariant +// that `SourcePosition`s are always within the same allocation. +impl Ord for SourcePosition<'_> { + #[inline] + fn cmp(&self, other: &Self) -> Ordering { + let offset = self.offset_from_signed(*other); + #[expect(clippy::comparison_chain)] + if offset < 0 { + Ordering::Less + } else if offset == 0 { + Ordering::Equal + } else { + Ordering::Greater + } + } +} + +impl PartialOrd for SourcePosition<'_> { + #[inline] + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + + #[inline] + fn lt(&self, other: &Self) -> bool { + self.offset_from_signed(*other) < 0 + } + + #[inline] + fn le(&self, other: &Self) -> bool { + self.offset_from_signed(*other) <= 0 + } + + #[inline] + fn gt(&self, other: &Self) -> bool { + self.offset_from_signed(*other) > 0 + } + + #[inline] + fn ge(&self, other: &Self) -> bool { + self.offset_from_signed(*other) >= 0 + } +} + /// Return if byte is a UTF-8 continuation byte. #[inline] const fn is_utf8_cont_byte(byte: u8) -> bool { diff --git a/crates/oxc_parser/src/lexer/template.rs b/crates/oxc_parser/src/lexer/template.rs index f6da2531a892b..254da47b79848 100644 --- a/crates/oxc_parser/src/lexer/template.rs +++ b/crates/oxc_parser/src/lexer/template.rs @@ -50,7 +50,7 @@ impl<'a> Lexer<'a> { b'$' => { // SAFETY: Next byte is `$` which is ASCII, so after it is a UTF-8 char boundary let after_dollar = unsafe { pos.add(1) }; - if after_dollar.addr() < self.source.end_addr() { + if after_dollar.is_not_end_of(&self.source) { // If `${`, exit. // SAFETY: Have checked there's at least 1 further byte to read. if unsafe { after_dollar.read() } == b'{' { @@ -118,7 +118,7 @@ impl<'a> Lexer<'a> { pos = unsafe { pos.add(1) }; // If at EOF, exit. This illegal in valid JS, so cold branch. - if pos.addr() == self.source.end_addr() { + if pos.is_end_of(&self.source) { return cold_branch(|| { self.source.advance_to_end(); self.error(diagnostics::unterminated_string(self.unterminated_range())); @@ -229,7 +229,7 @@ impl<'a> Lexer<'a> { if next_byte == b'$' { // SAFETY: Next byte is `$` which is ASCII, so after it is a UTF-8 char boundary let after_dollar = unsafe {pos.add(1)}; - if after_dollar.addr() < self.source.end_addr() { + if after_dollar.is_not_end_of(&self.source) { // If `${`, exit. // SAFETY: Have checked there's at least 1 further byte to read. if unsafe {after_dollar.read()} == b'{' { @@ -280,7 +280,7 @@ impl<'a> Lexer<'a> { // brought up to `chunk_start` again. chunk_start = unsafe {pos.add(1)}; - if chunk_start.addr() < self.source.end_addr() { + if chunk_start.is_not_end_of(&self.source) { // Either `\r` alone or `\r\n` needs to be converted to `\n`. // SAFETY: Have checked not at EOF. if unsafe {chunk_start.read()} == b'\n' { @@ -321,7 +321,7 @@ impl<'a> Lexer<'a> { // Start next chunk after escape sequence chunk_start = self.source.position(); - assert!(chunk_start.addr() >= after_backslash.addr()); + assert!(chunk_start >= after_backslash); // Continue search after escape sequence. // NB: `byte_search!` macro increments `pos` when return `true`,