diff --git a/rust/arrow/src/alloc/alignment.rs b/rust/arrow/src/alloc/alignment.rs new file mode 100644 index 00000000000..dbf4602f83a --- /dev/null +++ b/rust/arrow/src/alloc/alignment.rs @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// NOTE: Below code is written for spatial/temporal prefetcher optimizations. Memory allocation +// should align well with usage pattern of cache access and block sizes on layers of storage levels from +// registers to non-volatile memory. These alignments are all cache aware alignments incorporated +// from [cuneiform](https://crates.io/crates/cuneiform) crate. This approach mimicks Intel TBB's +// cache_aligned_allocator which exploits cache locality and minimizes prefetch signals +// resulting in less round trip time between the layers of storage. +// For further info: https://software.intel.com/en-us/node/506094 + +// 32-bit architecture and things other than netburst microarchitecture are using 64 bytes. +/// Cache and allocation multiple alignment size +#[cfg(target_arch = "x86")] +pub const ALIGNMENT: usize = 1 << 6; + +// Intel x86_64: +// L2D streamer from L1: +// Loads data or instructions from memory to the second-level cache. To use the streamer, +// organize the data or instructions in blocks of 128 bytes, aligned on 128 bytes. +// - https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf +/// Cache and allocation multiple alignment size +#[cfg(target_arch = "x86_64")] +pub const ALIGNMENT: usize = 1 << 7; + +// 24Kc: +// Data Line Size +// - https://s3-eu-west-1.amazonaws.com/downloads-mips/documents/MD00346-2B-24K-DTS-04.00.pdf +// - https://gitlab.e.foundation/e/devices/samsung/n7100/stable_android_kernel_samsung_smdk4412/commit/2dbac10263b2f3c561de68b4c369bc679352ccee +/// Cache and allocation multiple alignment size +#[cfg(target_arch = "mips")] +pub const ALIGNMENT: usize = 1 << 5; +/// Cache and allocation multiple alignment size +#[cfg(target_arch = "mips64")] +pub const ALIGNMENT: usize = 1 << 5; + +// Defaults for powerpc +/// Cache and allocation multiple alignment size +#[cfg(target_arch = "powerpc")] +pub const ALIGNMENT: usize = 1 << 5; + +// Defaults for the ppc 64 +/// Cache and allocation multiple alignment size +#[cfg(target_arch = "powerpc64")] +pub const ALIGNMENT: usize = 1 << 6; + +// e.g.: sifive +// - https://github.com/torvalds/linux/blob/master/Documentation/devicetree/bindings/riscv/sifive-l2-cache.txt#L41 +// in general all of them are the same. +/// Cache and allocation multiple alignment size +#[cfg(target_arch = "riscv")] +pub const ALIGNMENT: usize = 1 << 6; + +// This size is same across all hardware for this architecture. +// - https://docs.huihoo.com/doxygen/linux/kernel/3.7/arch_2s390_2include_2asm_2cache_8h.html +/// Cache and allocation multiple alignment size +#[cfg(target_arch = "s390x")] +pub const ALIGNMENT: usize = 1 << 8; + +// This size is same across all hardware for this architecture. +// - https://docs.huihoo.com/doxygen/linux/kernel/3.7/arch_2sparc_2include_2asm_2cache_8h.html#a9400cc2ba37e33279bdbc510a6311fb4 +/// Cache and allocation multiple alignment size +#[cfg(target_arch = "sparc")] +pub const ALIGNMENT: usize = 1 << 5; +/// Cache and allocation multiple alignment size +#[cfg(target_arch = "sparc64")] +pub const ALIGNMENT: usize = 1 << 6; + +// On ARM cache line sizes are fixed. both v6 and v7. +// Need to add board specific or platform specific things later. +/// Cache and allocation multiple alignment size +#[cfg(target_arch = "thumbv6")] +pub const ALIGNMENT: usize = 1 << 5; +/// Cache and allocation multiple alignment size +#[cfg(target_arch = "thumbv7")] +pub const ALIGNMENT: usize = 1 << 5; + +// Operating Systems cache size determines this. +// Currently no way to determine this without runtime inference. +/// Cache and allocation multiple alignment size +#[cfg(target_arch = "wasm32")] +pub const ALIGNMENT: usize = 1 << 6; + +// Same as v6 and v7. +// List goes like that: +// Cortex A, M, R, ARM v7, v7-M, Krait and NeoverseN uses this size. +/// Cache and allocation multiple alignment size +#[cfg(target_arch = "arm")] +pub const ALIGNMENT: usize = 1 << 5; + +// Combined from 4 sectors. Volta says 128. +// Prevent chunk optimizations better to go to the default size. +// If you have smaller data with less padded functionality then use 32 with force option. +// - https://devtalk.nvidia.com/default/topic/803600/variable-cache-line-width-/ +/// Cache and allocation multiple alignment size +#[cfg(target_arch = "nvptx")] +pub const ALIGNMENT: usize = 1 << 7; +/// Cache and allocation multiple alignment size +#[cfg(target_arch = "nvptx64")] +pub const ALIGNMENT: usize = 1 << 7; + +// This size is same across all hardware for this architecture. +/// Cache and allocation multiple alignment size +#[cfg(target_arch = "aarch64")] +pub const ALIGNMENT: usize = 1 << 6; diff --git a/rust/arrow/src/alloc/mod.rs b/rust/arrow/src/alloc/mod.rs new file mode 100644 index 00000000000..a225d32dd82 --- /dev/null +++ b/rust/arrow/src/alloc/mod.rs @@ -0,0 +1,136 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Defines memory-related functions, such as allocate/deallocate/reallocate memory +//! regions, cache and allocation alignments. + +use std::mem::size_of; +use std::ptr::NonNull; +use std::{ + alloc::{handle_alloc_error, Layout}, + sync::atomic::AtomicIsize, +}; + +mod alignment; +mod types; + +pub use alignment::ALIGNMENT; +pub use types::NativeType; + +// If this number is not zero after all objects have been `drop`, there is a memory leak +pub static mut ALLOCATIONS: AtomicIsize = AtomicIsize::new(0); + +#[inline] +unsafe fn null_pointer() -> NonNull { + NonNull::new_unchecked(ALIGNMENT as *mut T) +} + +/// Allocates a cache-aligned memory region of `size` bytes with uninitialized values. +/// This is more performant than using [allocate_aligned_zeroed] when all bytes will have +/// an unknown or non-zero value and is semantically similar to `malloc`. +pub fn allocate_aligned(size: usize) -> NonNull { + unsafe { + if size == 0 { + null_pointer() + } else { + let size = size * size_of::(); + ALLOCATIONS.fetch_add(size as isize, std::sync::atomic::Ordering::SeqCst); + + let layout = Layout::from_size_align_unchecked(size, ALIGNMENT); + let raw_ptr = std::alloc::alloc(layout) as *mut T; + NonNull::new(raw_ptr).unwrap_or_else(|| handle_alloc_error(layout)) + } + } +} + +/// Allocates a cache-aligned memory region of `size` bytes with `0` on all of them. +/// This is more performant than using [allocate_aligned] and setting all bytes to zero +/// and is semantically similar to `calloc`. +pub fn allocate_aligned_zeroed(size: usize) -> NonNull { + unsafe { + if size == 0 { + null_pointer() + } else { + let size = size * size_of::(); + ALLOCATIONS.fetch_add(size as isize, std::sync::atomic::Ordering::SeqCst); + + let layout = Layout::from_size_align_unchecked(size, ALIGNMENT); + let raw_ptr = std::alloc::alloc_zeroed(layout) as *mut T; + NonNull::new(raw_ptr).unwrap_or_else(|| handle_alloc_error(layout)) + } + } +} + +/// # Safety +/// +/// This function is unsafe because undefined behavior can result if the caller does not ensure all +/// of the following: +/// +/// * ptr must denote a block of memory currently allocated via this allocator, +/// +/// * size must be the same size that was used to allocate that block of memory, +pub unsafe fn free_aligned(ptr: NonNull, size: usize) { + if ptr != null_pointer() { + let size = size * size_of::(); + ALLOCATIONS.fetch_sub(size as isize, std::sync::atomic::Ordering::SeqCst); + std::alloc::dealloc( + ptr.as_ptr() as *mut u8, + Layout::from_size_align_unchecked(size, ALIGNMENT), + ); + } +} + +/// # Safety +/// +/// This function is unsafe because undefined behavior can result if the caller does not ensure all +/// of the following: +/// +/// * ptr must be currently allocated via this allocator, +/// +/// * new_size must be greater than zero. +/// +/// * new_size, when rounded up to the nearest multiple of [ALIGNMENT], must not overflow (i.e., +/// the rounded value must be less than usize::MAX). +pub unsafe fn reallocate( + ptr: NonNull, + old_size: usize, + new_size: usize, +) -> NonNull { + let old_size = old_size * size_of::(); + let new_size = new_size * size_of::(); + if ptr == null_pointer() { + return allocate_aligned(new_size); + } + + if new_size == 0 { + free_aligned(ptr, old_size); + return null_pointer(); + } + + ALLOCATIONS.fetch_add( + new_size as isize - old_size as isize, + std::sync::atomic::Ordering::SeqCst, + ); + let raw_ptr = std::alloc::realloc( + ptr.as_ptr() as *mut u8, + Layout::from_size_align_unchecked(old_size, ALIGNMENT), + new_size, + ) as *mut T; + NonNull::new(raw_ptr).unwrap_or_else(|| { + handle_alloc_error(Layout::from_size_align_unchecked(new_size, ALIGNMENT)) + }) +} diff --git a/rust/arrow/src/alloc/types.rs b/rust/arrow/src/alloc/types.rs new file mode 100644 index 00000000000..0e177da7db8 --- /dev/null +++ b/rust/arrow/src/alloc/types.rs @@ -0,0 +1,175 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::datatypes::DataType; + +/// A type that Rust's custom allocator knows how to allocate and deallocate. +/// This is implemented for all Arrow's physical types whose in-memory representation +/// matches Rust's physical types. Consider this trait sealed. +/// # Safety +/// Do not implement this trait. +pub unsafe trait NativeType: + Sized + Copy + std::fmt::Debug + std::fmt::Display + PartialEq + Default + Sized + 'static +{ + type Bytes: AsRef<[u8]>; + + /// Whether a DataType is a valid type for this physical representation. + fn is_valid(data_type: &DataType) -> bool; + + /// How this type represents itself as bytes in little endianess. + /// This is used for IPC, where data is communicated with a specific endianess. + fn to_le_bytes(&self) -> Self::Bytes; +} + +unsafe impl NativeType for u8 { + type Bytes = [u8; std::mem::size_of::()]; + #[inline] + fn to_le_bytes(&self) -> Self::Bytes { + Self::to_le_bytes(*self) + } + + #[inline] + fn is_valid(data_type: &DataType) -> bool { + data_type == &DataType::UInt8 + } +} + +unsafe impl NativeType for u16 { + type Bytes = [u8; std::mem::size_of::()]; + #[inline] + fn to_le_bytes(&self) -> Self::Bytes { + Self::to_le_bytes(*self) + } + + #[inline] + fn is_valid(data_type: &DataType) -> bool { + data_type == &DataType::UInt16 + } +} + +unsafe impl NativeType for u32 { + type Bytes = [u8; std::mem::size_of::()]; + #[inline] + fn to_le_bytes(&self) -> Self::Bytes { + Self::to_le_bytes(*self) + } + + #[inline] + fn is_valid(data_type: &DataType) -> bool { + data_type == &DataType::UInt32 + } +} + +unsafe impl NativeType for u64 { + type Bytes = [u8; std::mem::size_of::()]; + #[inline] + fn to_le_bytes(&self) -> Self::Bytes { + Self::to_le_bytes(*self) + } + + #[inline] + fn is_valid(data_type: &DataType) -> bool { + data_type == &DataType::UInt64 + } +} + +unsafe impl NativeType for i8 { + type Bytes = [u8; std::mem::size_of::()]; + #[inline] + fn to_le_bytes(&self) -> Self::Bytes { + Self::to_le_bytes(*self) + } + + #[inline] + fn is_valid(data_type: &DataType) -> bool { + data_type == &DataType::Int8 + } +} + +unsafe impl NativeType for i16 { + type Bytes = [u8; std::mem::size_of::()]; + #[inline] + fn to_le_bytes(&self) -> Self::Bytes { + Self::to_le_bytes(*self) + } + + #[inline] + fn is_valid(data_type: &DataType) -> bool { + data_type == &DataType::Int16 + } +} + +unsafe impl NativeType for i32 { + type Bytes = [u8; std::mem::size_of::()]; + #[inline] + fn to_le_bytes(&self) -> Self::Bytes { + Self::to_le_bytes(*self) + } + + #[inline] + fn is_valid(data_type: &DataType) -> bool { + matches!( + data_type, + DataType::Int32 | DataType::Date32 | DataType::Time32(_) + ) + } +} + +unsafe impl NativeType for i64 { + type Bytes = [u8; std::mem::size_of::()]; + #[inline] + fn to_le_bytes(&self) -> Self::Bytes { + Self::to_le_bytes(*self) + } + + #[inline] + fn is_valid(data_type: &DataType) -> bool { + matches!( + data_type, + DataType::Int64 + | DataType::Date64 + | DataType::Time64(_) + | DataType::Timestamp(_, _) + ) + } +} + +unsafe impl NativeType for f32 { + type Bytes = [u8; std::mem::size_of::()]; + #[inline] + fn to_le_bytes(&self) -> Self::Bytes { + Self::to_le_bytes(*self) + } + + #[inline] + fn is_valid(data_type: &DataType) -> bool { + data_type == &DataType::Float32 + } +} + +unsafe impl NativeType for f64 { + type Bytes = [u8; std::mem::size_of::()]; + #[inline] + fn to_le_bytes(&self) -> Self::Bytes { + Self::to_le_bytes(*self) + } + + #[inline] + fn is_valid(data_type: &DataType) -> bool { + data_type == &DataType::Float64 + } +} diff --git a/rust/arrow/src/array/array_list.rs b/rust/arrow/src/array/array_list.rs index 8458836bfd6..f2076b3e86d 100644 --- a/rust/arrow/src/array/array_list.rs +++ b/rust/arrow/src/array/array_list.rs @@ -378,12 +378,12 @@ impl fmt::Debug for FixedSizeListArray { #[cfg(test)] mod tests { use crate::{ + alloc, array::ArrayData, array::Int32Array, buffer::Buffer, datatypes::Field, datatypes::{Int32Type, ToByteSlice}, - memory, util::bit_util, }; @@ -993,7 +993,7 @@ mod tests { #[test] #[should_panic(expected = "memory is not aligned")] fn test_primitive_array_alignment() { - let ptr = memory::allocate_aligned(8); + let ptr = alloc::allocate_aligned::(8); let buf = unsafe { Buffer::from_raw_parts(ptr, 8, 8) }; let buf2 = buf.slice(1); let array_data = ArrayData::builder(DataType::Int32).add_buffer(buf2).build(); @@ -1003,7 +1003,7 @@ mod tests { #[test] #[should_panic(expected = "memory is not aligned")] fn test_list_array_alignment() { - let ptr = memory::allocate_aligned(8); + let ptr = alloc::allocate_aligned::(8); let buf = unsafe { Buffer::from_raw_parts(ptr, 8, 8) }; let buf2 = buf.slice(1); diff --git a/rust/arrow/src/array/raw_pointer.rs b/rust/arrow/src/array/raw_pointer.rs index 897dc5b591c..185e1cbe98a 100644 --- a/rust/arrow/src/array/raw_pointer.rs +++ b/rust/arrow/src/array/raw_pointer.rs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. -use crate::memory; use std::ptr::NonNull; /// This struct is highly `unsafe` and offers the possibility to self-reference a [arrow::Buffer] from [arrow::array::ArrayData]. @@ -36,7 +35,11 @@ impl RawPtrBox { /// * `ptr` is not aligned to a slice of type `T`. This is guaranteed if it was built from a slice of type `T`. pub(super) unsafe fn new(ptr: *const u8) -> Self { let ptr = NonNull::new(ptr as *mut u8).expect("Pointer cannot be null"); - assert!(memory::is_ptr_aligned::(ptr), "memory is not aligned"); + assert_eq!( + ptr.as_ptr().align_offset(std::mem::align_of::()), + 0, + "memory is not aligned" + ); Self { ptr: ptr.cast() } } diff --git a/rust/arrow/src/buffer/immutable.rs b/rust/arrow/src/buffer/immutable.rs index e96bc003c8b..c09e4ddc48a 100644 --- a/rust/arrow/src/buffer/immutable.rs +++ b/rust/arrow/src/buffer/immutable.rs @@ -21,9 +21,7 @@ use std::ptr::NonNull; use std::sync::Arc; use std::{convert::AsRef, usize}; -use crate::memory; use crate::util::bit_chunk_iterator::BitChunks; -use crate::util::bit_util; use crate::{ bytes::{Bytes, Deallocation}, datatypes::ArrowNativeType, @@ -56,19 +54,11 @@ impl Buffer { /// Initializes a [Buffer] from a slice of items. pub fn from_slice_ref>(items: &T) -> Self { - // allocate aligned memory buffer let slice = items.as_ref(); - let len = slice.len() * std::mem::size_of::(); - let capacity = bit_util::round_upto_multiple_of_64(len); - let buffer = memory::allocate_aligned(capacity); - unsafe { - memory::memcpy( - buffer, - NonNull::new_unchecked(slice.as_ptr() as *mut u8), - len, - ); - Buffer::build_with_arguments(buffer, len, Deallocation::Native(capacity)) - } + let len = slice.len(); + let mut buffer = MutableBuffer::with_capacity(len); + buffer.extend_from_slice(slice); + buffer.into() } /// Creates a buffer from an existing memory region (must already be byte-aligned), this diff --git a/rust/arrow/src/buffer/mutable.rs b/rust/arrow/src/buffer/mutable.rs index 9f0238f9d99..ddc0501f466 100644 --- a/rust/arrow/src/buffer/mutable.rs +++ b/rust/arrow/src/buffer/mutable.rs @@ -1,9 +1,9 @@ use std::ptr::NonNull; use crate::{ + alloc, bytes::{Bytes, Deallocation}, datatypes::{ArrowNativeType, ToByteSlice}, - memory, util::bit_util, }; @@ -53,8 +53,14 @@ impl MutableBuffer { /// Allocate a new [MutableBuffer] with initial capacity to be at least `capacity`. #[inline] pub fn new(capacity: usize) -> Self { + Self::with_capacity(capacity) + } + + /// Allocate a new [MutableBuffer] with initial capacity to be at least `capacity`. + #[inline] + pub fn with_capacity(capacity: usize) -> Self { let capacity = bit_util::round_upto_multiple_of_64(capacity); - let ptr = memory::allocate_aligned(capacity); + let ptr = alloc::allocate_aligned(capacity); Self { data: ptr, len: 0, @@ -75,7 +81,7 @@ impl MutableBuffer { /// ``` pub fn from_len_zeroed(len: usize) -> Self { let new_capacity = bit_util::round_upto_multiple_of_64(len); - let ptr = memory::allocate_aligned_zeroed(new_capacity); + let ptr = alloc::allocate_aligned_zeroed(new_capacity); Self { data: ptr, len, @@ -324,7 +330,7 @@ unsafe fn reallocate( ) -> (NonNull, usize) { let new_capacity = bit_util::round_upto_multiple_of_64(new_capacity); let new_capacity = std::cmp::max(new_capacity, old_capacity * 2); - let ptr = memory::reallocate(ptr, old_capacity, new_capacity); + let ptr = alloc::reallocate(ptr, old_capacity, new_capacity); (ptr, new_capacity) } @@ -460,7 +466,7 @@ impl std::ops::DerefMut for MutableBuffer { impl Drop for MutableBuffer { fn drop(&mut self) { - unsafe { memory::free_aligned(self.data, self.capacity) }; + unsafe { alloc::free_aligned(self.data, self.capacity) }; } } diff --git a/rust/arrow/src/bytes.rs b/rust/arrow/src/bytes.rs index 323654954f8..38fa4439b42 100644 --- a/rust/arrow/src/bytes.rs +++ b/rust/arrow/src/bytes.rs @@ -24,7 +24,7 @@ use std::ptr::NonNull; use std::sync::Arc; use std::{fmt::Debug, fmt::Formatter}; -use crate::{ffi, memory}; +use crate::{alloc, ffi}; /// Mode of deallocating memory regions pub enum Deallocation { @@ -126,7 +126,7 @@ impl Drop for Bytes { fn drop(&mut self) { match &self.deallocation { Deallocation::Native(capacity) => { - unsafe { memory::free_aligned(self.ptr, *capacity) }; + unsafe { alloc::free_aligned::(self.ptr, *capacity) }; } // foreign interface knows how to deallocate itself. Deallocation::Foreign(_) => (), diff --git a/rust/arrow/src/lib.rs b/rust/arrow/src/lib.rs index c082d6136e2..9c2ca2723ce 100644 --- a/rust/arrow/src/lib.rs +++ b/rust/arrow/src/lib.rs @@ -135,6 +135,7 @@ // introduced to ignore lint errors when upgrading from 2020-04-22 to 2020-11-14 #![allow(clippy::float_equality_without_abs, clippy::type_complexity)] +mod alloc; mod arch; pub mod array; pub mod bitmap; @@ -147,7 +148,6 @@ pub mod error; pub mod ffi; pub mod ipc; pub mod json; -pub mod memory; pub mod record_batch; pub mod temporal_conversions; pub mod tensor; diff --git a/rust/arrow/src/memory.rs b/rust/arrow/src/memory.rs deleted file mode 100644 index 0ea8845decc..00000000000 --- a/rust/arrow/src/memory.rs +++ /dev/null @@ -1,277 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines memory-related functions, such as allocate/deallocate/reallocate memory -//! regions, cache and allocation alignments. - -use std::mem::align_of; -use std::ptr::NonNull; -use std::{ - alloc::{handle_alloc_error, Layout}, - sync::atomic::AtomicIsize, -}; - -// NOTE: Below code is written for spatial/temporal prefetcher optimizations. Memory allocation -// should align well with usage pattern of cache access and block sizes on layers of storage levels from -// registers to non-volatile memory. These alignments are all cache aware alignments incorporated -// from [cuneiform](https://crates.io/crates/cuneiform) crate. This approach mimicks Intel TBB's -// cache_aligned_allocator which exploits cache locality and minimizes prefetch signals -// resulting in less round trip time between the layers of storage. -// For further info: https://software.intel.com/en-us/node/506094 - -// 32-bit architecture and things other than netburst microarchitecture are using 64 bytes. -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "x86")] -pub const ALIGNMENT: usize = 1 << 6; - -// Intel x86_64: -// L2D streamer from L1: -// Loads data or instructions from memory to the second-level cache. To use the streamer, -// organize the data or instructions in blocks of 128 bytes, aligned on 128 bytes. -// - https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "x86_64")] -pub const ALIGNMENT: usize = 1 << 7; - -// 24Kc: -// Data Line Size -// - https://s3-eu-west-1.amazonaws.com/downloads-mips/documents/MD00346-2B-24K-DTS-04.00.pdf -// - https://gitlab.e.foundation/e/devices/samsung/n7100/stable_android_kernel_samsung_smdk4412/commit/2dbac10263b2f3c561de68b4c369bc679352ccee -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "mips")] -pub const ALIGNMENT: usize = 1 << 5; -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "mips64")] -pub const ALIGNMENT: usize = 1 << 5; - -// Defaults for powerpc -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "powerpc")] -pub const ALIGNMENT: usize = 1 << 5; - -// Defaults for the ppc 64 -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "powerpc64")] -pub const ALIGNMENT: usize = 1 << 6; - -// e.g.: sifive -// - https://github.com/torvalds/linux/blob/master/Documentation/devicetree/bindings/riscv/sifive-l2-cache.txt#L41 -// in general all of them are the same. -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "riscv")] -pub const ALIGNMENT: usize = 1 << 6; - -// This size is same across all hardware for this architecture. -// - https://docs.huihoo.com/doxygen/linux/kernel/3.7/arch_2s390_2include_2asm_2cache_8h.html -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "s390x")] -pub const ALIGNMENT: usize = 1 << 8; - -// This size is same across all hardware for this architecture. -// - https://docs.huihoo.com/doxygen/linux/kernel/3.7/arch_2sparc_2include_2asm_2cache_8h.html#a9400cc2ba37e33279bdbc510a6311fb4 -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "sparc")] -pub const ALIGNMENT: usize = 1 << 5; -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "sparc64")] -pub const ALIGNMENT: usize = 1 << 6; - -// On ARM cache line sizes are fixed. both v6 and v7. -// Need to add board specific or platform specific things later. -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "thumbv6")] -pub const ALIGNMENT: usize = 1 << 5; -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "thumbv7")] -pub const ALIGNMENT: usize = 1 << 5; - -// Operating Systems cache size determines this. -// Currently no way to determine this without runtime inference. -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "wasm32")] -pub const ALIGNMENT: usize = FALLBACK_ALIGNMENT; - -// Same as v6 and v7. -// List goes like that: -// Cortex A, M, R, ARM v7, v7-M, Krait and NeoverseN uses this size. -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "arm")] -pub const ALIGNMENT: usize = 1 << 5; - -// Combined from 4 sectors. Volta says 128. -// Prevent chunk optimizations better to go to the default size. -// If you have smaller data with less padded functionality then use 32 with force option. -// - https://devtalk.nvidia.com/default/topic/803600/variable-cache-line-width-/ -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "nvptx")] -pub const ALIGNMENT: usize = 1 << 7; -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "nvptx64")] -pub const ALIGNMENT: usize = 1 << 7; - -// This size is same across all hardware for this architecture. -/// Cache and allocation multiple alignment size -#[cfg(target_arch = "aarch64")] -pub const ALIGNMENT: usize = 1 << 6; - -#[doc(hidden)] -/// Fallback cache and allocation multiple alignment size -const FALLBACK_ALIGNMENT: usize = 1 << 6; - -/// -/// As you can see this is global and lives as long as the program lives. -/// Be careful to not write anything to this pointer in any scenario. -/// If you use allocation methods shown here you won't have any problems. -const BYPASS_PTR: NonNull = unsafe { NonNull::new_unchecked(ALIGNMENT as *mut u8) }; - -// If this number is not zero after all objects have been `drop`, there is a memory leak -pub static mut ALLOCATIONS: AtomicIsize = AtomicIsize::new(0); - -/// Allocates a cache-aligned memory region of `size` bytes with uninitialized values. -/// This is more performant than using [allocate_aligned_zeroed] when all bytes will have -/// an unknown or non-zero value and is semantically similar to `malloc`. -pub fn allocate_aligned(size: usize) -> NonNull { - unsafe { - if size == 0 { - // In a perfect world, there is no need to request zero size allocation. - // Currently, passing zero sized layout to alloc is UB. - // This will dodge allocator api for any type. - BYPASS_PTR - } else { - ALLOCATIONS.fetch_add(size as isize, std::sync::atomic::Ordering::SeqCst); - - let layout = Layout::from_size_align_unchecked(size, ALIGNMENT); - let raw_ptr = std::alloc::alloc(layout); - NonNull::new(raw_ptr).unwrap_or_else(|| handle_alloc_error(layout)) - } - } -} - -/// Allocates a cache-aligned memory region of `size` bytes with `0u8` on all of them. -/// This is more performant than using [allocate_aligned] and setting all bytes to zero -/// and is semantically similar to `calloc`. -pub fn allocate_aligned_zeroed(size: usize) -> NonNull { - unsafe { - if size == 0 { - // In a perfect world, there is no need to request zero size allocation. - // Currently, passing zero sized layout to alloc is UB. - // This will dodge allocator api for any type. - BYPASS_PTR - } else { - ALLOCATIONS.fetch_add(size as isize, std::sync::atomic::Ordering::SeqCst); - - let layout = Layout::from_size_align_unchecked(size, ALIGNMENT); - let raw_ptr = std::alloc::alloc_zeroed(layout); - NonNull::new(raw_ptr).unwrap_or_else(|| handle_alloc_error(layout)) - } - } -} - -/// # Safety -/// -/// This function is unsafe because undefined behavior can result if the caller does not ensure all -/// of the following: -/// -/// * ptr must denote a block of memory currently allocated via this allocator, -/// -/// * size must be the same size that was used to allocate that block of memory, -pub unsafe fn free_aligned(ptr: NonNull, size: usize) { - if ptr != BYPASS_PTR { - ALLOCATIONS.fetch_sub(size as isize, std::sync::atomic::Ordering::SeqCst); - std::alloc::dealloc( - ptr.as_ptr(), - Layout::from_size_align_unchecked(size, ALIGNMENT), - ); - } -} - -/// # Safety -/// -/// This function is unsafe because undefined behavior can result if the caller does not ensure all -/// of the following: -/// -/// * ptr must be currently allocated via this allocator, -/// -/// * new_size must be greater than zero. -/// -/// * new_size, when rounded up to the nearest multiple of [ALIGNMENT], must not overflow (i.e., -/// the rounded value must be less than usize::MAX). -pub unsafe fn reallocate( - ptr: NonNull, - old_size: usize, - new_size: usize, -) -> NonNull { - if ptr == BYPASS_PTR { - return allocate_aligned(new_size); - } - - if new_size == 0 { - free_aligned(ptr, old_size); - return BYPASS_PTR; - } - - ALLOCATIONS.fetch_add( - new_size as isize - old_size as isize, - std::sync::atomic::Ordering::SeqCst, - ); - let raw_ptr = std::alloc::realloc( - ptr.as_ptr(), - Layout::from_size_align_unchecked(old_size, ALIGNMENT), - new_size, - ); - NonNull::new(raw_ptr).unwrap_or_else(|| { - handle_alloc_error(Layout::from_size_align_unchecked(new_size, ALIGNMENT)) - }) -} - -/// # Safety -/// -/// Behavior is undefined if any of the following conditions are violated: -/// -/// * `src` must be valid for reads of `len * size_of::()` bytes. -/// -/// * `dst` must be valid for writes of `len * size_of::()` bytes. -/// -/// * Both `src` and `dst` must be properly aligned. -/// -/// `memcpy` creates a bitwise copy of `T`, regardless of whether `T` is [`Copy`]. If `T` is not -/// [`Copy`], using both the values in the region beginning at `*src` and the region beginning at -/// `*dst` can [violate memory safety][read-ownership]. -pub unsafe fn memcpy(dst: NonNull, src: NonNull, count: usize) { - if src != BYPASS_PTR { - std::ptr::copy_nonoverlapping(src.as_ptr(), dst.as_ptr(), count) - } -} - -pub fn is_ptr_aligned(p: NonNull) -> bool { - p.as_ptr().align_offset(align_of::()) == 0 -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_allocate() { - for _ in 0..10 { - let p = allocate_aligned(1024); - // make sure this is 64-byte aligned - assert_eq!(0, (p.as_ptr() as usize) % 64); - unsafe { free_aligned(p, 1024) }; - } - } -}