Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion crates/oxc_parser/src/lexer/comment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ impl<'a> Lexer<'a> {
if next_byte == b'*' {
// SAFETY: Next byte is `*` (ASCII) so after it is UTF-8 char boundary
let after_star = unsafe { pos.add(1) };
if after_star.addr() < self.source.end_addr() {
if after_star.is_not_end_of(&self.source) {
// If next byte isn't `/`, continue
// SAFETY: Have checked there's at least 1 further byte to read
if unsafe { after_star.read() } == b'/' {
Expand Down
2 changes: 1 addition & 1 deletion crates/oxc_parser/src/lexer/identifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ impl<'a> Lexer<'a> {
pub fn private_identifier(&mut self) -> Kind {
// Handle EOF directly after `#`
let start_pos = self.source.position();
if start_pos.addr() == self.source.end_addr() {
if start_pos.is_end_of(&self.source) {
return cold_branch(|| {
let start = self.offset();
self.error(diagnostics::unexpected_end(Span::new(start, start)));
Expand Down
12 changes: 8 additions & 4 deletions crates/oxc_parser/src/lexer/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,7 @@ macro_rules! byte_search {
let mut $pos = $start;
#[allow(unused_unsafe, clippy::unnecessary_safety_comment)] // Silence warnings if macro called in unsafe code
'outer: loop {
let $byte = if $pos.addr() <= $lexer.source.end_for_batch_search_addr() {
let $byte = if $lexer.source.has_search_batch_remaining_from($pos) {
// Search a batch of `SEARCH_BATCH_SIZE` bytes.
//
// `'inner: loop {}` is not a real loop - it always exits on first turn.
Expand All @@ -444,7 +444,7 @@ macro_rules! byte_search {
// compiler to unroll it.
//
// SAFETY:
// `$pos.addr() <= lexer.source.end_for_batch_search_addr()` check above ensures
// `$lexer.source.has_search_batch_remaining_from($pos)` check above ensures
// there are at least `SEARCH_BATCH_SIZE` bytes remaining in `lexer.source`.
// So calls to `$pos.read()` and `$pos.add(1)` in this loop cannot go out of bounds.
'inner: loop {
Expand All @@ -466,9 +466,13 @@ macro_rules! byte_search {
} else {
// Not enough bytes remaining for a batch. Process byte-by-byte.
// Same as above, `'inner: loop {}` is not a real loop here - always exits on first turn.
let end_addr = $lexer.source.end_addr();
let end_pos = $lexer.source.end_position();
'inner: loop {
while $pos.addr() < end_addr {
// SAFETY: `end_pos` is end of source, so `$pos` must be on or before it.
// Note: `$pos < end_pos` would be semantically equivalent, but use `offset_from`
// because it takes advantage of `Source`'s invariant that `$pos` can never be
// after `end_pos`, which may allow compiler to produce better codegen.
while unsafe { end_pos.offset_from($pos) } > 0 {
// SAFETY: `pos` is not at end of source, so safe to read a byte
let byte = unsafe { $pos.read() };
if $table.matches(byte) {
Expand Down
202 changes: 149 additions & 53 deletions crates/oxc_parser/src/lexer/source.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#![expect(clippy::unnecessary_safety_comment)]

use std::{marker::PhantomData, slice, str};
use std::{cmp::Ordering, marker::PhantomData, slice, str};

use assert_unchecked::assert_unchecked;

use crate::{UniquePromise, MAX_LEN};

Expand Down Expand Up @@ -109,45 +111,28 @@ impl<'a> Source<'a> {
// SAFETY:
// `start` and `end` are created from a `&str` in `Source::new`, so `start` cannot be after `end`.
// `start` and `end` are by definition on UTF-8 char boundaries.
unsafe {
self.str_between_positions_unchecked(
SourcePosition::new(self.start),
SourcePosition::new(self.end),
)
}
unsafe { self.str_between_positions_unchecked(self.start_position(), self.end_position()) }
}

/// Get remaining source text as `&str`.
#[inline]
pub(super) fn remaining(&self) -> &'a str {
// SAFETY:
// Invariant of `Source` is that `ptr` is always <= `end`, and is on a UTF-8 char boundary.
// `end` is pointer to end of original `&str`, so be definition a UTF-8 char boundary.
unsafe {
self.str_between_positions_unchecked(
SourcePosition::new(self.ptr),
SourcePosition::new(self.end),
)
}
// `end` is pointer to end of original `&str`, so by definition is a UTF-8 char boundary.
unsafe { self.str_between_positions_unchecked(self.position(), self.end_position()) }
}

/// Return whether at end of source.
#[inline]
pub(super) fn is_eof(&self) -> bool {
self.ptr == self.end
self.position().is_end_of(self)
}

/// Get end address.
/// Returns `true` if at least `Lexer::search::SEARCH_BATCH_SIZE` bytes remaining in source.
#[inline]
pub(super) fn end_addr(&self) -> usize {
self.end as usize
}

/// Get last memory address at which a batch of `Lexer::search::SEARCH_BATCH_SIZE` bytes
/// can be read without going out of bounds.
#[inline]
pub(super) fn end_for_batch_search_addr(&self) -> usize {
self.end_for_batch_search_addr
pub(super) fn has_search_batch_remaining_from(&self, pos: SourcePosition<'a>) -> bool {
pos.ptr as usize <= self.end_for_batch_search_addr
}

/// Get current position.
Expand All @@ -164,6 +149,24 @@ impl<'a> Source<'a> {
unsafe { SourcePosition::new(self.ptr) }
}

/// Get start of this `Source` as a `SourcePosition`.
///
/// `SourcePosition` lives as long as the source text `&str` that `Source` was created from.
#[inline]
pub(super) fn start_position(&self) -> SourcePosition<'a> {
// SAFETY: Creating a `SourcePosition` from start of `Source` is always valid
unsafe { SourcePosition::new(self.start) }
}

/// Get end of this `Source` as a `SourcePosition`.
///
/// `SourcePosition` lives as long as the source text `&str` that `Source` was created from.
#[inline]
pub(super) fn end_position(&self) -> SourcePosition<'a> {
// SAFETY: Creating a `SourcePosition` from end of `Source` is always valid
unsafe { SourcePosition::new(self.end) }
}

/// Move current position.
#[inline]
pub(super) fn set_position(&mut self, pos: SourcePosition<'a>) {
Expand Down Expand Up @@ -216,7 +219,7 @@ impl<'a> Source<'a> {

/// Get string slice from a `SourcePosition` up to the current position of `Source`.
pub(super) fn str_from_pos_to_current(&self, pos: SourcePosition<'a>) -> &'a str {
assert!(pos.ptr <= self.ptr);
assert!(pos <= self.position());
// SAFETY: The above assertion satisfies `str_from_pos_to_current_unchecked`'s requirements
unsafe { self.str_from_pos_to_current_unchecked(pos) }
}
Expand All @@ -235,7 +238,7 @@ impl<'a> Source<'a> {
) -> &'a str {
// SAFETY: Caller guarantees `pos` is not after current position of `Source`.
// `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`.
self.str_between_positions_unchecked(pos, SourcePosition::new(self.ptr))
self.str_between_positions_unchecked(pos, self.position())
}

/// Get string slice from current position of `Source` up to a `SourcePosition`, without checks.
Expand All @@ -250,18 +253,16 @@ impl<'a> Source<'a> {
&self,
pos: SourcePosition<'a>,
) -> &'a str {
// SAFETY: Caller guarantees `pos` is not before current position of `Source`.
// `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`.
self.str_between_positions_unchecked(SourcePosition::new(self.ptr), pos)
// SAFETY: Caller guarantees `pos` is not before current position of `Source`
self.str_between_positions_unchecked(self.position(), pos)
}

/// Get string slice from a `SourcePosition` up to the end of `Source`.
#[inline]
pub(super) fn str_from_pos_to_end(&self, pos: SourcePosition<'a>) -> &'a str {
// SAFETY: Invariants of `SourcePosition` is that it cannot be after end of `Source`,
// and always on a UTF-8 character boundary.
// `self.end` is always a valid `SourcePosition` due to invariants of `Source`.
unsafe { self.str_between_positions_unchecked(pos, SourcePosition::new(self.end)) }
// and always on a UTF-8 character boundary
unsafe { self.str_between_positions_unchecked(pos, self.end_position()) }
}

/// Get string slice of source between 2 `SourcePosition`s, without checks.
Expand Down Expand Up @@ -295,8 +296,8 @@ impl<'a> Source<'a> {
// Invariants of `Source` and `SourcePosition` types guarantee that both are positioned
// on UTF-8 character boundaries. So slicing source text between these 2 points will always
// yield a valid UTF-8 string.
let len = end.addr() - start.addr();
let slice = slice::from_raw_parts(start.ptr, len);
let len = end.offset_from(start);
let slice = slice::from_raw_parts(start.ptr, len as usize);
std::str::from_utf8_unchecked(slice)
}

Expand All @@ -307,11 +308,16 @@ impl<'a> Source<'a> {
}

/// Get offset of `pos`.
#[expect(clippy::cast_possible_truncation)]
#[inline]
pub(super) fn offset_of(&self, pos: SourcePosition<'a>) -> u32 {
// Cannot overflow `u32` because of `MAX_LEN` check in `Source::new`
(pos.addr() - self.start as usize) as u32
// SAFETY: `SourcePosition`s can only be created from a `Source`.
// `Source::new` takes a `UniquePromise`, which guarantees that it's the only `Source`
// in existence on this thread. `Source` is not `Sync` or `Send`, so no possibility another
// `Source` originated on another thread can "jump" onto this one.
// This is sufficient to guarantee that any `SourcePosition` that parser/lexer holds must be
// from this `Source`. `SourcePosition`s must always be within bounds of the `Source` they were
// created from, and therefore `pos` is guaranteed to be equal to or after `self.start`.
unsafe { pos.offset_from(self.start_position()) }
}

/// Move current position back by `n` bytes.
Expand All @@ -331,7 +337,7 @@ impl<'a> Source<'a> {
assert!(n > 0, "Cannot call `Source::back` with 0");

// Ensure not attempting to go back to before start of source
let offset = self.ptr as usize - self.start as usize;
let offset = self.offset() as usize;
assert!(n <= offset, "Cannot go back {n} bytes - only {offset} bytes consumed");

// SAFETY: We have checked that `n` is less than distance between `start` and `ptr`,
Expand Down Expand Up @@ -521,9 +527,9 @@ impl<'a> Source<'a> {
/// Peek next two bytes of source without consuming them.
#[inline]
pub(super) fn peek_2_bytes(&self) -> Option<[u8; 2]> {
// `end` is always >= `ptr` so `end - ptr` cannot wrap around.
// No need to use checked/saturating subtraction here.
if (self.end as usize) - (self.ptr as usize) >= 2 {
// SAFETY: `end` is always >= `ptr`
let remaining_len = unsafe { self.end_position().offset_from(self.position()) };
if remaining_len >= 2 {
// SAFETY: The check above ensures that there are at least 2 bytes to
// read from `self.ptr` without reading past `self.end`
let bytes = unsafe { self.position().read2() };
Expand Down Expand Up @@ -560,7 +566,7 @@ pub struct SourcePosition<'a> {
_marker: PhantomData<&'a u8>,
}

impl SourcePosition<'_> {
impl<'a> SourcePosition<'a> {
/// Create a new `SourcePosition` from a pointer.
///
/// # SAFETY
Expand All @@ -574,12 +580,6 @@ impl SourcePosition<'_> {
Self { ptr, _marker: PhantomData }
}

/// Get memory address of `SourcePosition` as a `usize`.
#[inline]
pub(super) fn addr(self) -> usize {
self.ptr as usize
}

/// Create new `SourcePosition` which is `n` bytes after this one.
/// The provenance of the pointer `SourcePosition` contains is maintained.
///
Expand All @@ -604,6 +604,80 @@ impl SourcePosition<'_> {
Self::new(self.ptr.sub(n))
}

/// Get offset of this `SourcePosition` relative to another `SourcePosition`.
///
/// # SAFETY
/// `self` must be after or equal to `other`.
/// i.e. `end.offset_from(start)`.
#[inline]
pub(super) unsafe fn offset_from(self, other: Self) -> u32 {
// SAFETY: `SourcePosition`s can only be created from a `Source`.
// `Source::new` takes a `UniquePromise`, which guarantees that it's the only `Source`
// in existence on this thread. `Source` is not `Sync` or `Send`, so no possibility another
// `Source` originated on another thread can "jump" onto this one.
// This is sufficient to guarantee that any `SourcePosition` that parser/lexer holds must be
// from this `Source`, therefore `self` and `other` must both be within the same
// allocation, and derived from the same original pointer.
// Source text length is limited to `MAX_LEN`, which cannot be more than either `u32::MAX`
// or `isize::MAX`, so distance between `self` and `other` cannot exceed either.
let offset = self.distance_from(other);
debug_assert!(u32::try_from(offset).is_ok());
u32::try_from(offset).unwrap_unchecked()
}

/// Get distance of this `SourcePosition` relative to another `SourcePosition`.
///
/// `self` can be before or after `other`.
/// * If `self` is after `other`, return value is positive.
/// * If `self` is before `other`, return value is negative.
#[inline]
fn distance_from(self, other: Self) -> isize {
debug_assert!(if self.ptr < other.ptr {
other.ptr as usize - self.ptr as usize <= MAX_LEN
} else {
self.ptr as usize - other.ptr as usize <= MAX_LEN
});

// SAFETY: `SourcePosition`s can only be created from a `Source`.
// `Source::new` takes a `UniquePromise`, which guarantees that it's the only `Source`
// in existence on this thread. `Source` is not `Sync` or `Send`, so no possibility another
// `Source` originated on another thread can "jump" onto this one.
// This is sufficient to guarantee that any `SourcePosition` that parser/lexer holds must be
// from this `Source`, therefore `self` and `other` must both be within the same
// allocation, and derived from the same original pointer.
// Source text length is limited to `MAX_LEN`, which cannot be more than either `u32::MAX`
// or `isize::MAX`, so distance between `self` and `other` cannot exceed either.
// The `isize::MAX` constraint is required to satisfy safety conditions of `ptr::offset_from`.
unsafe {
let offset = self.ptr.offset_from(other.ptr);

// These assertions aim to give the compiler as much information as possible about
// invariants constraining the possible values of `offset`
assert_unchecked!(offset != isize::MIN);
let offset_abs = offset.checked_abs().unwrap_unchecked();
assert_unchecked!(usize::try_from(offset_abs).unwrap_unchecked() <= MAX_LEN);

offset
}
}

/// Returns `true` if this `SourcePosition` is at end of source.
#[inline]
pub(super) fn is_end_of(self, source: &Source<'a>) -> bool {
// SAFETY: `SourcePosition`s can only be created from a `Source`.
// `Source::new` takes a `UniquePromise`, which guarantees that it's the only `Source`
// in existence on this thread. `Source` is not `Sync` or `Send`, so no possibility another
// `Source` originated on another thread can "jump" onto this one.
// Therefore `self` must be from provided `Source`, and so cannot be after source's end.
unsafe { source.end_position().offset_from(self) == 0 }
}

/// Returns `true` if this `SourcePosition` is not at end of source.
#[inline]
pub(super) fn is_not_end_of(self, source: &Source<'a>) -> bool {
!self.is_end_of(source)
}

/// Read byte from this `SourcePosition`.
///
/// # SAFETY
Expand Down Expand Up @@ -640,19 +714,41 @@ impl SourcePosition<'_> {
#[inline]
pub(super) unsafe fn read2(self) -> [u8; 2] {
// SAFETY:
// Caller guarantees `self` is not at no later than 2 bytes before end of source text.
// Caller guarantees `self` is at no later than 2 bytes before end of source text.
// `Source` is created from a valid `&str`, so points to allocated, initialized memory.
// `Source` conceptually holds the source text `&str`, which guarantees no mutable references
// to the same memory can exist, as that would violate Rust's aliasing rules.
// Pointer is "dereferenceable" by definition as a `u8` is 1 byte and cannot span multiple objects.
// Pointer is "dereferenceable" since caller guarantees it's within bounds of source text `&str`.
// Alignment is not relevant as `u8` is aligned on 1 (i.e. no alignment requirements).
debug_assert!(!self.ptr.is_null());
#[expect(clippy::ptr_as_ptr)]
let p = self.ptr as *const [u8; 2];
let p = self.ptr.cast::<[u8; 2]>();
*p.as_ref().unwrap_unchecked()
}
}

impl PartialEq for SourcePosition<'_> {
#[inline]
fn eq(&self, other: &Self) -> bool {
self.distance_from(*other) == 0
}
}

impl Eq for SourcePosition<'_> {}

impl PartialOrd for SourcePosition<'_> {
#[inline]
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}

impl Ord for SourcePosition<'_> {
#[inline]
fn cmp(&self, other: &Self) -> Ordering {
self.distance_from(*other).cmp(&0)
}
}

/// Return if byte is a UTF-8 continuation byte.
#[inline]
const fn is_utf8_cont_byte(byte: u8) -> bool {
Expand Down
Loading
Loading