-
Notifications
You must be signed in to change notification settings - Fork 4k
ARROW-5357: [Rust] Change Buffer::len to represent total bytes instead of used bytes #4331
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -53,18 +53,28 @@ struct BufferData { | |
| /// The raw pointer into the buffer bytes | ||
| ptr: *const u8, | ||
|
|
||
| /// The length (num of bytes) of the buffer | ||
| /// The length (num of bytes) of the buffer. The region `[0, len)` of the buffer | ||
| /// is occupied with meaningful data, while the rest `[len, capacity)` is the | ||
| /// unoccupied region. | ||
| len: usize, | ||
|
|
||
| /// Whether this piece of memory is owned by this object | ||
| owned: bool, | ||
|
|
||
| /// The capacity (num of bytes) of the buffer | ||
| /// Invariant: len <= capacity | ||
| capacity: usize, | ||
| } | ||
|
|
||
| impl PartialEq for BufferData { | ||
| fn eq(&self, other: &BufferData) -> bool { | ||
| if self.len != other.len { | ||
| return false; | ||
| } | ||
| if self.capacity != other.capacity { | ||
| return false; | ||
| } | ||
|
|
||
| unsafe { memory::memcmp(self.ptr, other.ptr, self.len) == 0 } | ||
| } | ||
| } | ||
|
|
@@ -73,7 +83,7 @@ impl PartialEq for BufferData { | |
| impl Drop for BufferData { | ||
| fn drop(&mut self) { | ||
| if !self.ptr.is_null() && self.owned { | ||
| memory::free_aligned(self.ptr as *mut u8, self.len); | ||
| memory::free_aligned(self.ptr as *mut u8, self.capacity); | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -82,8 +92,8 @@ impl Debug for BufferData { | |
| fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { | ||
| write!( | ||
| f, | ||
| "BufferData {{ ptr: {:?}, len: {}, data: ", | ||
| self.ptr, self.len | ||
| "BufferData {{ ptr: {:?}, len: {}, capacity: {}, data: ", | ||
| self.ptr, self.len, self.capacity | ||
| )?; | ||
|
|
||
| unsafe { | ||
|
|
@@ -104,13 +114,14 @@ impl Buffer { | |
| /// | ||
| /// * `ptr` - Pointer to raw parts | ||
| /// * `len` - Length of raw parts in **bytes** | ||
| /// * `capacity` - Total allocated memory for the pointer `ptr`, in **bytes** | ||
| /// | ||
| /// # Safety | ||
| /// | ||
| /// This function is unsafe as there is no guarantee that the given pointer is valid for `len` | ||
| /// bytes. | ||
| pub unsafe fn from_raw_parts(ptr: *const u8, len: usize) -> Self { | ||
| Buffer::build_with_arguments(ptr, len, true) | ||
| pub unsafe fn from_raw_parts(ptr: *const u8, len: usize, capacity: usize) -> Self { | ||
| Buffer::build_with_arguments(ptr, len, capacity, true) | ||
| } | ||
|
|
||
| /// Creates a buffer from an existing memory region (must already be byte-aligned), this | ||
|
|
@@ -120,13 +131,14 @@ impl Buffer { | |
| /// | ||
| /// * `ptr` - Pointer to raw parts | ||
| /// * `len` - Length of raw parts in **bytes** | ||
| /// * `capacity` - Total allocated memory for the pointer `ptr`, in **bytes** | ||
| /// | ||
| /// # Safety | ||
| /// | ||
| /// This function is unsafe as there is no guarantee that the given pointer is valid for `len` | ||
| /// bytes. | ||
| pub unsafe fn from_unowned(ptr: *const u8, len: usize) -> Self { | ||
| Buffer::build_with_arguments(ptr, len, false) | ||
| pub unsafe fn from_unowned(ptr: *const u8, len: usize, capacity: usize) -> Self { | ||
| Buffer::build_with_arguments(ptr, len, capacity, false) | ||
| } | ||
|
|
||
| /// Creates a buffer from an existing memory region (must already be byte-aligned). | ||
|
|
@@ -135,19 +147,30 @@ impl Buffer { | |
| /// | ||
| /// * `ptr` - Pointer to raw parts | ||
| /// * `len` - Length of raw parts in bytes | ||
| /// * `capacity` - Total allocated memory for the pointer `ptr`, in **bytes** | ||
| /// * `owned` - Whether the raw parts is owned by this `Buffer`. If true, this `Buffer` will | ||
| /// free this memory when dropped, otherwise it will skip freeing the raw parts. | ||
| /// | ||
| /// # Safety | ||
| /// | ||
| /// This function is unsafe as there is no guarantee that the given pointer is valid for `len` | ||
| /// bytes. | ||
| unsafe fn build_with_arguments(ptr: *const u8, len: usize, owned: bool) -> Self { | ||
| unsafe fn build_with_arguments( | ||
| ptr: *const u8, | ||
| len: usize, | ||
| capacity: usize, | ||
| owned: bool, | ||
| ) -> Self { | ||
| assert!( | ||
| memory::is_aligned(ptr, memory::ALIGNMENT), | ||
| "memory not aligned" | ||
| ); | ||
|
||
| let buf_data = BufferData { ptr, len, owned }; | ||
| let buf_data = BufferData { | ||
| ptr, | ||
| len, | ||
| capacity, | ||
| owned, | ||
| }; | ||
| Buffer { | ||
| data: Arc::new(buf_data), | ||
| offset: 0, | ||
|
|
@@ -159,6 +182,11 @@ impl Buffer { | |
| self.data.len - self.offset | ||
| } | ||
|
|
||
| /// Returns the capacity of this buffer | ||
| pub fn capacity(&self) -> usize { | ||
| self.data.capacity | ||
| } | ||
|
|
||
| /// Returns whether the buffer is empty. | ||
| pub fn is_empty(&self) -> bool { | ||
| self.data.len - self.offset == 0 | ||
|
|
@@ -210,7 +238,7 @@ impl Buffer { | |
|
|
||
| /// Returns an empty buffer. | ||
| pub fn empty() -> Self { | ||
| unsafe { Self::from_raw_parts(::std::ptr::null(), 0) } | ||
| unsafe { Self::from_raw_parts(::std::ptr::null(), 0, 0) } | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -234,7 +262,7 @@ impl<T: AsRef<[u8]>> From<T> for Buffer { | |
| let buffer = memory::allocate_aligned(capacity); | ||
| unsafe { | ||
| memory::memcpy(buffer, slice.as_ptr(), len); | ||
| Buffer::from_raw_parts(buffer, len) | ||
| Buffer::from_raw_parts(buffer, len, capacity) | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -504,6 +532,7 @@ impl MutableBuffer { | |
| let buffer_data = BufferData { | ||
| ptr: self.data, | ||
| len: self.len, | ||
| capacity: self.capacity, | ||
| owned: true, | ||
| }; | ||
| std::mem::forget(self); | ||
|
|
@@ -527,6 +556,9 @@ impl PartialEq for MutableBuffer { | |
| if self.len != other.len { | ||
| return false; | ||
| } | ||
| if self.capacity != other.capacity { | ||
|
||
| return false; | ||
| } | ||
| unsafe { memory::memcmp(self.data, other.data, self.len) == 0 } | ||
| } | ||
| } | ||
|
|
@@ -584,45 +616,47 @@ mod tests { | |
|
|
||
| #[test] | ||
| fn test_from_raw_parts() { | ||
| let buf = unsafe { Buffer::from_raw_parts(null_mut(), 0) }; | ||
| let buf = unsafe { Buffer::from_raw_parts(null_mut(), 0, 0) }; | ||
| assert_eq!(0, buf.len()); | ||
| assert_eq!(0, buf.data().len()); | ||
| assert_eq!(0, buf.capacity()); | ||
| assert!(buf.raw_data().is_null()); | ||
|
|
||
| let buf = Buffer::from(&[0, 1, 2, 3, 4]); | ||
| assert_eq!(5, buf.len()); | ||
| assert!(!buf.raw_data().is_null()); | ||
| assert_eq!(&[0, 1, 2, 3, 4], buf.data()); | ||
| assert_eq!([0, 1, 2, 3, 4], buf.data()); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_from_vec() { | ||
| let buf = Buffer::from(&[0, 1, 2, 3, 4]); | ||
| assert_eq!(5, buf.len()); | ||
| assert!(!buf.raw_data().is_null()); | ||
| assert_eq!(&[0, 1, 2, 3, 4], buf.data()); | ||
| assert_eq!([0, 1, 2, 3, 4], buf.data()); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_copy() { | ||
| let buf = Buffer::from(&[0, 1, 2, 3, 4]); | ||
| let buf2 = buf.clone(); | ||
| assert_eq!(5, buf2.len()); | ||
| assert_eq!(64, buf2.capacity()); | ||
| assert!(!buf2.raw_data().is_null()); | ||
| assert_eq!(&[0, 1, 2, 3, 4], buf2.data()); | ||
| assert_eq!([0, 1, 2, 3, 4], buf2.data()); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_slice() { | ||
| let buf = Buffer::from(&[2, 4, 6, 8, 10]); | ||
| let buf2 = buf.slice(2); | ||
|
|
||
| assert_eq!(&[6, 8, 10], buf2.data()); | ||
| assert_eq!([6, 8, 10], buf2.data()); | ||
| assert_eq!(3, buf2.len()); | ||
| assert_eq!(unsafe { buf.raw_data().offset(2) }, buf2.raw_data()); | ||
|
|
||
| let buf3 = buf2.slice(1); | ||
| assert_eq!(&[8, 10], buf3.data()); | ||
| assert_eq!([8, 10], buf3.data()); | ||
| assert_eq!(2, buf3.len()); | ||
| assert_eq!(unsafe { buf.raw_data().offset(3) }, buf3.raw_data()); | ||
|
|
||
|
|
@@ -778,18 +812,38 @@ mod tests { | |
| buf.write("aaaa bbbb cccc dddd".as_bytes()) | ||
| .expect("write should be OK"); | ||
| assert_eq!(19, buf.len()); | ||
| assert_eq!(64, buf.capacity()); | ||
| assert_eq!("aaaa bbbb cccc dddd".as_bytes(), buf.data()); | ||
|
|
||
| let immutable_buf = buf.freeze(); | ||
| assert_eq!(19, immutable_buf.len()); | ||
| assert_eq!(64, immutable_buf.capacity()); | ||
| assert_eq!("aaaa bbbb cccc dddd".as_bytes(), immutable_buf.data()); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_mutable_equal() -> Result<()> { | ||
| let mut buf = MutableBuffer::new(1); | ||
| let mut buf2 = MutableBuffer::new(1); | ||
|
|
||
| buf.write(&[0xaa])?; | ||
| buf2.write(&[0xaa, 0xbb])?; | ||
| assert!(buf != buf2); | ||
|
|
||
| buf.write(&[0xbb])?; | ||
| assert_eq!(buf, buf2); | ||
|
|
||
| buf2.reserve(65)?; | ||
| assert!(buf != buf2); | ||
|
|
||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_access_concurrently() { | ||
| let buffer = Buffer::from(vec![1, 2, 3, 4, 5]); | ||
| let buffer2 = buffer.clone(); | ||
| assert_eq!(&[1, 2, 3, 4, 5], buffer.data()); | ||
| assert_eq!([1, 2, 3, 4, 5], buffer.data()); | ||
|
|
||
| let buffer_copy = thread::spawn(move || { | ||
| // access buffer in another thread. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not against the current implementation, but I wonder if we should only compare the "meaningful" data?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree. It seems wrong that on one hand the memcmp is comparing up to
lenand we bypass that if there is a mismatch oncapacityThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks guys! I'm slightly confused - here we are comparing meaningful data, no? and if capacity mismatch then it is considered not equal and we skip comparing the data content.
The equality is defined as: 1) both length should be equal, 2) both capacity should be equal, and 3) data content up to length should be equal. Let me know if this definition sounds good to you.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To me if two arrays only differ by the amount of padding that they have then I would consider them equal. When I perform operations using these two arrays I will get the same answer (because the padding, or rather amount of padding, does not impact the result). However:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Array equality is defined separately in
array/equal.rsand yes, it does take account on what you said (it compares buffer content withdata()). In the current context we are discussing equality of buffers though, which IMO, when looking at in isolation, should consider bothlenandcapacity.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, yes I agree.