From 152e59dc60b12875541b18c8aae6b400ff15589a Mon Sep 17 00:00:00 2001 From: overlookmotel <557937+overlookmotel@users.noreply.github.com> Date: Wed, 9 Jul 2025 09:53:37 +0000 Subject: [PATCH] feat(napi/oxlint): read source text into start of allocator (#12122) In `napi/oxlint2`, read source text into start of allocator, instead of end. This is what raw transfer needs. The implementation here is a complete dog's dinner! In order to inject this code into `oxlint` library code without affecting the behavior of existing `oxlint` app, it uses `oxlint2` feature to changes the allocator's behavior. There is also way too much unsafe code for comfort. Much of the code in this PR can be removed again when we replace `bumpalo` with our own allocator, because that allocator will allocate all strings at start of the arena by default anyway. So I propose that we accept this mess as a temporary solution just to get us to a working prototype, and then clean it up in next couple of weeks. --- Cargo.lock | 1 + apps/oxlint/Cargo.toml | 5 +- apps/oxlint/src/lib.rs | 3 + apps/oxlint/src/lint.rs | 9 ++ apps/oxlint/src/raw_fs.rs | 198 +++++++++++++++++++++++++ crates/oxc_allocator/src/fixed_size.rs | 2 +- crates/oxc_allocator/src/pool.rs | 17 ++- 7 files changed, 231 insertions(+), 4 deletions(-) create mode 100644 apps/oxlint/src/raw_fs.rs diff --git a/Cargo.lock b/Cargo.lock index 24fc5c6030e1b..f0b89207cc9c8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2437,6 +2437,7 @@ dependencies = [ "rustc-hash", "serde", "serde_json", + "simdutf8", "tempfile", "tracing-subscriber", ] diff --git a/apps/oxlint/Cargo.toml b/apps/oxlint/Cargo.toml index 5a4cc9c84dc7f..8f85ddc5ee120 100644 --- a/apps/oxlint/Cargo.toml +++ b/apps/oxlint/Cargo.toml @@ -41,6 +41,7 @@ rayon = { workspace = true } rustc-hash = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } +simdutf8 = { workspace = true, optional = true } tempfile = { workspace = true } tracing-subscriber = { workspace = true, features = [] } # Omit the `regex` feature @@ -60,6 +61,6 @@ lazy-regex = { workspace = true } [features] default = [] allocator = ["dep:mimalloc-safe"] -oxlint2 = ["oxc_linter/oxlint2"] -disable_oxlint2 = ["oxc_linter/disable_oxlint2"] +oxlint2 = ["oxc_linter/oxlint2", "oxc_allocator/fixed_size", "dep:simdutf8"] +disable_oxlint2 = ["oxc_linter/disable_oxlint2", "oxc_allocator/disable_fixed_size"] force_test_reporter = ["oxc_linter/force_test_reporter"] diff --git a/apps/oxlint/src/lib.rs b/apps/oxlint/src/lib.rs index dc111fa028431..efb78dce0c90a 100644 --- a/apps/oxlint/src/lib.rs +++ b/apps/oxlint/src/lib.rs @@ -14,6 +14,9 @@ pub use oxc_linter::{ ExternalLinter, ExternalLinterCb, ExternalLinterLoadPluginCb, PluginLoadResult, }; +#[cfg(all(feature = "oxlint2", not(feature = "disable_oxlint2")))] +mod raw_fs; + #[cfg(all(feature = "allocator", not(miri), not(target_family = "wasm")))] #[global_allocator] static GLOBAL: mimalloc_safe::MiMalloc = mimalloc_safe::MiMalloc; diff --git a/apps/oxlint/src/lint.rs b/apps/oxlint/src/lint.rs index 4058b8f1a4c07..cbf9b6fce4789 100644 --- a/apps/oxlint/src/lint.rs +++ b/apps/oxlint/src/lint.rs @@ -308,6 +308,15 @@ impl Runner for LintRunner { // Spawn linting in another thread so diagnostics can be printed immediately from diagnostic_service.run. rayon::spawn(move || { let mut lint_service = LintService::new(&linter, allocator_pool, options); + + // Use `RawTransferFileSystem` if `oxlint2` feature is enabled. + // This reads the source text into start of allocator, instead of the end. + #[cfg(all(feature = "oxlint2", not(feature = "disable_oxlint2")))] + { + use crate::raw_fs::RawTransferFileSystem; + lint_service = lint_service.with_file_system(Box::new(RawTransferFileSystem)); + } + lint_service.run(&tx_error); }); diff --git a/apps/oxlint/src/raw_fs.rs b/apps/oxlint/src/raw_fs.rs new file mode 100644 index 0000000000000..f40893fde18c5 --- /dev/null +++ b/apps/oxlint/src/raw_fs.rs @@ -0,0 +1,198 @@ +use std::{ + fs::{self, File}, + io::{self, Read}, + mem::ManuallyDrop, + path::Path, + ptr, slice, +}; + +use oxc_allocator::Allocator; +use oxc_linter::RuntimeFileSystem; + +/// File system used by `oxlint2`. +/// +/// Identical to `OsFileSystem`, except that `read_to_arena_str` reads the file's contents into +/// start of the allocator, instead of the end. This conforms to what raw transfer needs. +/// +/// Must only be used in conjunction with `AllocatorPool` with `fixed_size` feature enabled, +/// which wraps `Allocator`s with a custom `Drop` impl, which makes `read_to_arena_str` safe. +/// +/// This is a temporary solution. When we replace `bumpalo` with our own allocator, all strings +/// will be written at start of the arena, so then `OsFileSystem` will work fine, and we can +/// remove `RawTransferFileSystem`. TODO: Do that! +pub struct RawTransferFileSystem; + +impl RuntimeFileSystem for RawTransferFileSystem { + /// Read file from disk into start of `allocator`. + /// + /// # SAFETY + /// `allocator` must not be dropped after calling this method. + /// See [`Allocator::alloc_bytes_start`] for more details. + /// + /// This should be an unsafe method, but can't because we're implementing a safe trait method. + fn read_to_arena_str<'a>( + &self, + path: &Path, + allocator: &'a Allocator, + ) -> Result<&'a str, std::io::Error> { + // SAFETY: Caller promises not to allow `allocator` to be dropped + unsafe { read_to_arena_str(path, allocator) } + } + + fn write_file(&self, path: &Path, content: &str) -> Result<(), std::io::Error> { + fs::write(path, content) + } +} + +/// Read the contents of a UTF-8 encoded file directly into arena allocator, +/// at the *start* of the arena, instead of the end. +/// +/// Avoids intermediate allocations if file size is known in advance. +/// +/// This function opens the file at `path`, reads its entire contents into memory +/// allocated from the given [`Allocator`], validates that the bytes are valid UTF-8, +/// and returns a borrowed `&str` pointing to the allocator-backed data. +/// +/// This is useful for performance-critical workflows where zero-copy string handling is desired, +/// such as parsing large source files in memory-constrained or throughput-sensitive environments. +/// +/// # Parameters +/// +/// - `path`: The path to the file to read. +/// - `allocator`: The [`Allocator`] into which the file contents will be allocated. +/// +/// # Errors +/// +/// Returns [`io::Error`] if any of: +/// +/// - The file cannot be read. +/// - The file's contents are not valid UTF-8. +/// - The file's size exceeds the capacity of `allocator`. +/// +/// # SAFETY +/// `allocator` must not be dropped after calling this method. +/// See [`Allocator::alloc_bytes_start`] for more details. +unsafe fn read_to_arena_str<'alloc>( + path: &Path, + allocator: &'alloc Allocator, +) -> io::Result<&'alloc str> { + let file = File::open(path)?; + + let bytes = if let Ok(metadata) = file.metadata() { + // SAFETY: Caller guarantees `allocator` is not dropped after calling this method + unsafe { read_to_arena_bytes_known_size(file, metadata.len(), allocator) } + } else { + // SAFETY: Caller guarantees `allocator` is not dropped after calling this method + unsafe { read_to_arena_bytes_unknown_size(file, allocator) } + }?; + + // Convert to `&str`, checking contents is valid UTF-8 + simdutf8::basic::from_utf8(bytes).map_err(|_| { + io::Error::new(io::ErrorKind::InvalidData, "stream did not contain valid UTF-8") + }) +} + +/// Read contents of file directly into arena. +/// +/// # SAFETY +/// `allocator` must not be dropped after calling this method. +/// See [`Allocator::alloc_bytes_start`] for more details. +unsafe fn read_to_arena_bytes_known_size( + file: File, + size: u64, + allocator: &Allocator, +) -> io::Result<&[u8]> { + // Check file is not larger than `usize::MAX` bytes. + // Note: We don't need to check `size` is not larger than `isize::MAX` bytes here, + // because `Allocator::alloc_bytes_start` does a size check. + let Ok(mut size) = usize::try_from(size) else { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "File is larger than `usize::MAX` bytes", + )); + }; + + // Allocate space for string in allocator. + // SAFETY: Caller guarantees `allocator` is not dropped after calling this method. + let ptr = unsafe { allocator.alloc_bytes_start(size) }; + + // Read contents of file into allocated space. + // + // * Create a `Vec` which pretends to own the allocation we just created in arena. + // * Wrap the `Vec` in `ManuallyDrop`, so it doesn't free the memory at end of the block, + // or if there's a panic during reading. + // * Use `File::take` to obtain a reader which yields no more than `size` bytes. + // This ensures it can't produce more data than we allocated space for - in case file increased + // in size since the call to `file.metadata()`, or `file.metadata()` returned inaccurate size. + // * Use `Read::read_to_end` to fill the `Vec` from this reader. + // + // This is a hack. It's completely bananas that Rust doesn't provide a method to write into + // a slice of uninitialized bytes, but this seems to be the only safe way to do it on stable Rust. + // https://users.rust-lang.org/t/reading-c-style-structures-from-disk/70529/7 + // + // I (@overlookmotel) have reviewed the code for `Read::read_to_end` and it will never grow the `Vec`, + // as long as it has sufficient capacity for the reader's contents to start with. + // If it did, that would be UB as it would free the chunk of memory backing the `Vec`, + // which it doesn't actually own. + // + // Unfortunately `Read::read_to_end`'s docs don't guarantee this behavior. But the code is written + // specifically to avoid growing the `Vec`, and there was a PR to make sure it doesn't: + // https://github.com/rust-lang/rust/pull/89165 + // So I think in practice we can rely on this behavior. + { + // SAFETY: We've allocated `size` bytes starting at `ptr`. + // This `Vec` doesn't actually own that memory, but we immediately wrap it in `ManuallyDrop`, + // so it won't free the memory on drop. As long as the `Vec` doesn't grow, no UB (see above). + let vec = unsafe { Vec::from_raw_parts(ptr.as_ptr(), 0, size) }; + let mut vec = ManuallyDrop::new(vec); + let bytes_written = file.take(size as u64).read_to_end(&mut vec)?; + + debug_assert!(vec.capacity() == size); + debug_assert!(vec.len() == bytes_written); + + // Update `size`, in case file was altered and got smaller since the call to `file.metadata()`, + // or `file.metadata()` reported inaccurate size + size = vec.len(); + } + + // SAFETY: `size` bytes were written starting at `ptr`. + // Those bytes will remain untouched until the `Allocator` is reset, so returning a `&[u8]` with + // same lifetime as the `&Allocator` borrow is valid. + let bytes = unsafe { slice::from_raw_parts(ptr.as_ptr(), size) }; + Ok(bytes) +} + +/// Fallback for when file size is unknown. +/// Read file contents into a `Vec`, and then copy into arena. +/// +/// # SAFETY +/// `allocator` must not be dropped after calling this method. +/// See [`Allocator::alloc_bytes_start`] for more details. +unsafe fn read_to_arena_bytes_unknown_size( + mut file: File, + allocator: &Allocator, +) -> io::Result<&[u8]> { + // Read file into a `Vec` + let mut bytes = Vec::new(); + file.read_to_end(&mut bytes)?; + + // Copy bytes into start of allocator chunk. + // + // SAFETY: + // * `alloc_bytes_start` allocates space for `len` bytes at start of the arena chunk. + // That allocation cannot overlap the allocation owned by `bytes` vec. + // * After `copy_nonoverlapping` call, `len` bytes starting from `dst` are initialized, + // so safe to create a byte slice referencing those bytes. + // * Those bytes will remain untouched until the `Allocator` is reset, so returning a `&[u8]` with + // same lifetime as the `&Allocator` borrow is valid. + // * Caller guarantees `allocator` is not dropped after calling this method. + let slice = unsafe { + let src = bytes.as_ptr(); + let len = bytes.len(); + let dst = allocator.alloc_bytes_start(len).as_ptr(); + ptr::copy_nonoverlapping(src, dst, len); + slice::from_raw_parts(dst, len) + }; + + Ok(slice) +} diff --git a/crates/oxc_allocator/src/fixed_size.rs b/crates/oxc_allocator/src/fixed_size.rs index 4c536a3866161..6fa043e6968ee 100644 --- a/crates/oxc_allocator/src/fixed_size.rs +++ b/crates/oxc_allocator/src/fixed_size.rs @@ -34,7 +34,7 @@ const FOUR_GIB: usize = 1 << (IS_SUPPORTED_PLATFORM as usize * 32); const ALLOC_SIZE: usize = FOUR_GIB; const ALLOC_ALIGN: usize = TWO_GIB; const CHUNK_SIZE: usize = TWO_GIB; -const CHUNK_ALIGN: usize = FOUR_GIB; +pub const CHUNK_ALIGN: usize = FOUR_GIB; const ALLOC_LAYOUT: Layout = match Layout::from_size_align(ALLOC_SIZE, ALLOC_ALIGN) { Ok(layout) => layout, diff --git a/crates/oxc_allocator/src/pool.rs b/crates/oxc_allocator/src/pool.rs index 2459abe2b9dec..a8fb135914ad7 100644 --- a/crates/oxc_allocator/src/pool.rs +++ b/crates/oxc_allocator/src/pool.rs @@ -107,7 +107,10 @@ mod wrapper { #[cfg(all(feature = "fixed_size", not(feature = "disable_fixed_size")))] mod wrapper { - use crate::{Allocator, fixed_size::FixedSizeAllocator}; + use crate::{ + Allocator, + fixed_size::{CHUNK_ALIGN, FixedSizeAllocator}, + }; /// Structure which wraps an [`Allocator`] with fixed size of 2 GiB, and aligned on 4 GiB. /// @@ -127,7 +130,19 @@ mod wrapper { /// Reset the [`Allocator`] in this [`AllocatorWrapper`]. pub fn reset(&mut self) { + // Set cursor back to end self.0.reset(); + + // Set data pointer back to start. + // SAFETY: Fixed-size allocators have data pointer originally aligned on `CHUNK_ALIGN`, + // and size less than `CHUNK_ALIGN`. So we can restore original data pointer by rounding down + // to next multiple of `CHUNK_ALIGN`. + unsafe { + let data_ptr = self.0.data_ptr(); + let offset = data_ptr.as_ptr() as usize % CHUNK_ALIGN; + let data_ptr = data_ptr.sub(offset); + self.0.set_data_ptr(data_ptr); + } } /// Create a `Vec` of [`AllocatorWrapper`]s.