Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion crates/oxc_parser/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ doctest = false
[dependencies]
oxc_allocator = { workspace = true }
oxc_ast = { workspace = true }
oxc_data_structures = { workspace = true, features = ["assert_unchecked"] }
oxc_data_structures = { workspace = true, features = ["assert_unchecked", "pointer_ext"] }
oxc_diagnostics = { workspace = true }
oxc_ecmascript = { workspace = true }
oxc_regular_expression = { workspace = true, optional = true }
Expand Down
2 changes: 1 addition & 1 deletion crates/oxc_parser/src/lexer/comment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ impl<'a> Lexer<'a> {
if next_byte == b'*' {
// SAFETY: Next byte is `*` (ASCII) so after it is UTF-8 char boundary
let after_star = unsafe { pos.add(1) };
if after_star.addr() < self.source.end_addr() {
if after_star.is_not_end_of(&self.source) {
// If next byte isn't `/`, continue
// SAFETY: Have checked there's at least 1 further byte to read
if unsafe { after_star.read() } == b'/' {
Expand Down
2 changes: 1 addition & 1 deletion crates/oxc_parser/src/lexer/identifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ impl<'a> Lexer<'a> {
pub fn private_identifier(&mut self) -> Kind {
// Handle EOF directly after `#`
let start_pos = self.source.position();
if start_pos.addr() == self.source.end_addr() {
if start_pos.is_end_of(&self.source) {
return cold_branch(|| {
let start = self.offset();
self.error(diagnostics::unexpected_end(Span::new(start, start)));
Expand Down
9 changes: 4 additions & 5 deletions crates/oxc_parser/src/lexer/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,7 @@ macro_rules! byte_search {
// Silence warnings if macro called in unsafe code
#[allow(unused_unsafe, clippy::unnecessary_safety_comment, clippy::allow_attributes)]
'outer: loop {
let $byte = if $pos.addr() <= $lexer.source.end_for_batch_search_addr() {
let $byte = if $pos.can_read_batch_from(&$lexer.source) {
// Search a batch of `SEARCH_BATCH_SIZE` bytes.
//
// `'inner: loop {}` is not a real loop - it always exits on first turn.
Expand All @@ -447,8 +447,8 @@ macro_rules! byte_search {
// compiler to unroll it.
//
// SAFETY:
// `$pos.addr() <= lexer.source.end_for_batch_search_addr()` check above ensures
// there are at least `SEARCH_BATCH_SIZE` bytes remaining in `lexer.source`.
// `$pos.can_read_batch_from(&$lexer.source)` check above ensures there are
// at least `SEARCH_BATCH_SIZE` bytes remaining in `lexer.source`.
// So calls to `$pos.read()` and `$pos.add(1)` in this loop cannot go out of bounds.
'inner: loop {
for _i in 0..crate::lexer::search::SEARCH_BATCH_SIZE {
Expand All @@ -469,9 +469,8 @@ macro_rules! byte_search {
} else {
// Not enough bytes remaining for a batch. Process byte-by-byte.
// Same as above, `'inner: loop {}` is not a real loop here - always exits on first turn.
let end_addr = $lexer.source.end_addr();
'inner: loop {
while $pos.addr() < end_addr {
while $pos.is_not_end_of(&$lexer.source) {
// SAFETY: `pos` is not at end of source, so safe to read a byte
let byte = unsafe { $pos.read() };
if $table.matches(byte) {
Expand Down
212 changes: 158 additions & 54 deletions crates/oxc_parser/src/lexer/source.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
use std::{marker::PhantomData, slice, str};
use std::{cmp::Ordering, marker::PhantomData, slice, str};

use oxc_data_structures::pointer_ext::PointerExt;

use crate::{MAX_LEN, UniquePromise};

Expand Down Expand Up @@ -107,12 +109,7 @@ impl<'a> Source<'a> {
// SAFETY:
// `start` and `end` are created from a `&str` in `Source::new`, so `start` cannot be after `end`.
// `start` and `end` are by definition on UTF-8 char boundaries.
unsafe {
self.str_between_positions_unchecked(
SourcePosition::new(self.start),
SourcePosition::new(self.end),
)
}
unsafe { self.str_between_positions_unchecked(self.start(), self.end()) }
}

/// Get remaining source text as `&str`.
Expand All @@ -121,31 +118,35 @@ impl<'a> Source<'a> {
// SAFETY:
// Invariant of `Source` is that `ptr` is always <= `end`, and is on a UTF-8 char boundary.
// `end` is pointer to end of original `&str`, so by definition on a UTF-8 char boundary.
unsafe {
self.str_between_positions_unchecked(
SourcePosition::new(self.ptr),
SourcePosition::new(self.end),
)
}
unsafe { self.str_between_positions_unchecked(self.position(), self.end()) }
}

/// Return whether at end of source.
/// Get number of bytes of source text remaining.
#[inline]
pub(super) fn is_eof(&self) -> bool {
self.ptr == self.end
fn remaining_bytes(&self) -> usize {
// SAFETY: Invariant of `Source` is that `ptr` is always <= `end`
unsafe { self.end().offset_from(self.position()) }
}

/// Build the [`SourcePosition`] marking the very beginning of the source text.
#[inline]
fn start(&self) -> SourcePosition<'a> {
    let start_ptr = self.start;
    // SAFETY: `start` is the start of the source text, so it is in bounds
    // and on a UTF-8 char boundary
    unsafe { SourcePosition::new(start_ptr) }
}

/// Get end address.
/// Get [`SourcePosition`] for end of source.
#[inline]
pub(super) fn end_addr(&self) -> usize {
self.end as usize
fn end(&self) -> SourcePosition<'a> {
// SAFETY: `end` is the end of source, so in bounds and on a UTF-8 char boundary
unsafe { SourcePosition::new(self.end) }
}

/// Get last memory address at which a batch of `Lexer::search::SEARCH_BATCH_SIZE` bytes
/// can be read without going out of bounds.
/// Return whether at end of source.
#[inline]
pub(super) fn end_for_batch_search_addr(&self) -> usize {
self.end_for_batch_search_addr
pub(super) fn is_eof(&self) -> bool {
// TODO: Use `self.remaining_bytes() == 0` instead?
self.ptr == self.end
}

/// Get current position.
Expand Down Expand Up @@ -205,16 +206,16 @@ impl<'a> Source<'a> {
debug_assert!(ascii_byte.is_ascii());
let matched = self.peek_byte() == Some(ascii_byte);
if matched {
// SAFETY: next byte exists and is a valid ASCII char (and thus UTF-8
// char boundary).
// SAFETY: next byte exists and is a valid ASCII char (and thus UTF-8 char boundary).
self.ptr = unsafe { self.ptr.add(1) };
}
matched
}

/// Get string slice from a `SourcePosition` up to the current position of `Source`.
#[inline]
pub(super) fn str_from_pos_to_current(&self, pos: SourcePosition<'a>) -> &'a str {
assert!(pos.ptr <= self.ptr);
assert!(pos <= self.position());
// SAFETY: The above assertion satisfies `str_from_pos_to_current_unchecked`'s requirements
unsafe { self.str_from_pos_to_current_unchecked(pos) }
}
Expand All @@ -231,9 +232,8 @@ impl<'a> Source<'a> {
&self,
pos: SourcePosition<'a>,
) -> &'a str {
// SAFETY: Caller guarantees `pos` is not after current position of `Source`.
// `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`.
unsafe { self.str_between_positions_unchecked(pos, SourcePosition::new(self.ptr)) }
// SAFETY: Caller guarantees `pos` is not after current position of `Source`
unsafe { self.str_between_positions_unchecked(pos, self.position()) }
}

/// Get string slice from current position of `Source` up to a `SourcePosition`, without checks.
Expand All @@ -248,18 +248,16 @@ impl<'a> Source<'a> {
&self,
pos: SourcePosition<'a>,
) -> &'a str {
// SAFETY: Caller guarantees `pos` is not before current position of `Source`.
// `self.ptr` is always a valid `SourcePosition` due to invariants of `Source`.
unsafe { self.str_between_positions_unchecked(SourcePosition::new(self.ptr), pos) }
// SAFETY: Caller guarantees `pos` is not before current position of `Source`
unsafe { self.str_between_positions_unchecked(self.position(), pos) }
}

/// Get string slice from a `SourcePosition` up to the end of `Source`.
#[inline]
pub(super) fn str_from_pos_to_end(&self, pos: SourcePosition<'a>) -> &'a str {
// SAFETY: Invariants of `SourcePosition` is that it cannot be after end of `Source`,
// and always on a UTF-8 character boundary.
// `self.end` is always a valid `SourcePosition` due to invariants of `Source`.
unsafe { self.str_between_positions_unchecked(pos, SourcePosition::new(self.end)) }
// and always on a UTF-8 character boundary
unsafe { self.str_between_positions_unchecked(pos, self.end()) }
}

/// Get string slice of source between 2 `SourcePosition`s, without checks.
Expand Down Expand Up @@ -296,24 +294,36 @@ impl<'a> Source<'a> {
// on UTF-8 character boundaries. So slicing source text between these 2 points will always
// yield a valid UTF-8 string.
unsafe {
let len = end.addr() - start.addr();
let len = end.offset_from(start);
let slice = slice::from_raw_parts(start.ptr, len);
std::str::from_utf8_unchecked(slice)
}
}

/// Get current position in source, relative to start of source.
/// Get current position in source, relative to start of source, as `u32`.
#[inline]
pub(super) fn offset(&self) -> u32 {
self.offset_of(self.position())
}

/// Get offset of `pos`.
#[expect(clippy::cast_possible_truncation)]
/// Get the distance of the current position from the start of source, as `usize`.
#[inline]
pub(super) fn offset_usize(&self) -> usize {
    let current = self.position();
    self.offset_of_usize(current)
}

/// Get offset of `pos` as `u32`.
#[inline]
pub(super) fn offset_of(&self, pos: SourcePosition<'a>) -> u32 {
// Cannot overflow `u32` because of `MAX_LEN` check in `Source::new`
(pos.addr() - self.start as usize) as u32
// SAFETY: All `SourcePosition`s are always in bounds of the source text, which starts at `start`
unsafe { pos.offset_from_u32(self.start()) }
}

/// Get the distance of `pos` from the start of source, as `usize`.
#[inline]
pub(super) fn offset_of_usize(&self, pos: SourcePosition<'a>) -> usize {
    let origin = self.start();
    // SAFETY: Every `SourcePosition` is in bounds of the source text, and the source text
    // begins at `start`, so `pos` cannot be before `origin`
    unsafe { pos.offset_from(origin) }
}

/// Move current position back by `n` bytes.
Expand All @@ -333,7 +343,7 @@ impl<'a> Source<'a> {
assert!(n > 0, "Cannot call `Source::back` with 0");

// Ensure not attempting to go back to before start of source
let offset = self.ptr as usize - self.start as usize;
let offset = self.offset_usize();
assert!(n <= offset, "Cannot go back {n} bytes - only {offset} bytes consumed");

// SAFETY: We have checked that `n` is less than distance between `start` and `ptr`,
Expand Down Expand Up @@ -525,11 +535,9 @@ impl<'a> Source<'a> {
/// Peek next two bytes of source without consuming them.
#[inline]
pub(super) fn peek_2_bytes(&self) -> Option<[u8; 2]> {
// `end` is always >= `ptr` so `end - ptr` cannot wrap around.
// No need to use checked/saturating subtraction here.
if (self.end as usize) - (self.ptr as usize) >= 2 {
if self.remaining_bytes() >= 2 {
// SAFETY: The check above ensures that there are at least 2 bytes to
// read from `self.ptr` without reading past `self.end`
// read from current position without reading past end
let bytes = unsafe { self.position().read2() };
Some(bytes)
} else {
Expand Down Expand Up @@ -559,13 +567,13 @@ impl<'a> Source<'a> {
/// # SAFETY
/// `SourcePosition` must always be on a UTF-8 character boundary,
/// and within bounds of the `Source` that created it.
#[derive(Debug, Clone, Copy)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SourcePosition<'a> {
ptr: *const u8,
_marker: PhantomData<&'a u8>,
}

impl SourcePosition<'_> {
impl<'a> SourcePosition<'a> {
/// Create a new `SourcePosition` from a pointer.
///
/// # SAFETY
Expand All @@ -579,12 +587,6 @@ impl SourcePosition<'_> {
Self { ptr, _marker: PhantomData }
}

/// Get memory address of `SourcePosition` as a `usize`.
#[inline]
pub(super) fn addr(self) -> usize {
self.ptr as usize
}

/// Create new `SourcePosition` which is `n` bytes after this one.
/// The provenance of the pointer `SourcePosition` contains is maintained.
///
Expand All @@ -611,6 +613,64 @@ impl SourcePosition<'_> {
unsafe { Self::new(self.ptr.sub(n)) }
}

/// Number of bytes separating this [`SourcePosition`] from `origin`, as `usize`.
///
/// # SAFETY
/// `self` must be equal to or after `origin`.
#[inline]
pub(super) unsafe fn offset_from(self, origin: Self) -> usize {
    let (later, earlier) = (self.ptr, origin.ptr);
    // SAFETY: Caller guarantees `self` is not before `origin`.
    // All `SourcePosition<'a>`s are within bounds of same source text.
    unsafe { later.offset_from_usize(earlier) }
}

/// Number of bytes separating this [`SourcePosition`] from `origin`, as `u32`.
///
/// # SAFETY
/// `self` must be equal to or after `origin`.
#[inline]
pub(super) unsafe fn offset_from_u32(self, origin: Self) -> u32 {
    // SAFETY: Caller guarantees `self` is not before `origin`
    let distance = unsafe { self.offset_from(origin) };
    // The `MAX_LEN` check in `Source::new` means the distance always fits in a `u32`,
    // so this cast cannot truncate
    #[expect(clippy::cast_possible_truncation)]
    let narrowed = distance as u32;
    narrowed
}

/// Signed distance in bytes from `other` to this [`SourcePosition`], as `isize`.
///
/// The result is positive when `self` is after `other`, negative when `self` is
/// before `other`, and 0 when both are the same position.
#[inline]
fn offset_from_signed(self, other: Self) -> isize {
    let (this_ptr, other_ptr) = (self.ptr, other.ptr);
    // SAFETY: All `SourcePosition<'a>`s are within bounds of same source text
    unsafe { this_ptr.offset_from(other_ptr) }
}

/// Check whether this [`SourcePosition`] sits exactly at the end of `source`.
#[inline]
pub(super) fn is_end_of(self, source: &Source<'a>) -> bool {
    // TODO: Use `source.end().offset_from(self) == 0` instead?
    // Plain pointer equality against the end pointer of `source`.
    std::ptr::eq(self.ptr, source.end)
}

/// Check whether this [`SourcePosition`] is anywhere other than the end of `source`.
///
/// This is the exact negation of `is_end_of`.
#[inline]
pub(super) fn is_not_end_of(self, source: &Source<'a>) -> bool {
    self.ptr != source.end
}

/// Check whether a full batch of `Lexer::search::SEARCH_BATCH_SIZE` bytes can be read
/// from `source` starting at this [`SourcePosition`].
/// i.e. is this position at least `Lexer::search::SEARCH_BATCH_SIZE` bytes from the end of `source`?
///
/// Returns `true` if it is safe to read `Lexer::search::SEARCH_BATCH_SIZE` bytes from this position.
#[inline]
pub(super) fn can_read_batch_from(&self, source: &Source<'a>) -> bool {
    // Compare raw addresses against the batch-search limit stored on `source`
    let addr = self.ptr as usize;
    addr <= source.end_for_batch_search_addr
}

/// Read byte from this `SourcePosition`.
///
/// # SAFETY
Expand Down Expand Up @@ -664,6 +724,50 @@ impl SourcePosition<'_> {
}
}

// `Ord` and `PartialOrd` are implemented with `(*const u8)::offset_from` in order to utilize
// the invariant that `SourcePosition`s are always within the same allocation.
impl Ord for SourcePosition<'_> {
    #[inline]
    fn cmp(&self, other: &Self) -> Ordering {
        // The sign of the signed distance between the two positions is their ordering:
        // negative => `Less`, zero => `Equal`, positive => `Greater`
        self.offset_from_signed(*other).cmp(&0)
    }
}

impl PartialOrd for SourcePosition<'_> {
    /// Ordering is total, so this always returns `Some` of the `Ord::cmp` result.
    #[inline]
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }

    // The comparison operators below are overridden to test the signed distance
    // between the two positions directly, rather than going via `partial_cmp`.

    /// `true` if `self` is strictly before `other`.
    #[inline]
    fn lt(&self, other: &Self) -> bool {
        let delta = self.offset_from_signed(*other);
        delta < 0
    }

    /// `true` if `self` is before, or the same position as, `other`.
    #[inline]
    fn le(&self, other: &Self) -> bool {
        let delta = self.offset_from_signed(*other);
        delta <= 0
    }

    /// `true` if `self` is strictly after `other`.
    #[inline]
    fn gt(&self, other: &Self) -> bool {
        let delta = self.offset_from_signed(*other);
        delta > 0
    }

    /// `true` if `self` is after, or the same position as, `other`.
    #[inline]
    fn ge(&self, other: &Self) -> bool {
        let delta = self.offset_from_signed(*other);
        delta >= 0
    }
}

/// Return if byte is a UTF-8 continuation byte.
#[inline]
const fn is_utf8_cont_byte(byte: u8) -> bool {
Expand Down
Loading
Loading