Skip to content

Commit

Permalink
Merge pull request #92 from ytmimi/fix-generic-simd
Browse files Browse the repository at this point in the history
remove `packed_simd` in favor of `std::simd` and `#![feature(portable_simd)]`
  • Loading branch information
llogiq authored Apr 17, 2024
2 parents 934ea0e + 32a098d commit 68ecb39
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 17 deletions.
3 changes: 1 addition & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,11 @@ appveyor = { repository = "llogiq/bytecount" }
bench = false

[features]
generic-simd = ["packed_simd"]
generic-simd = []
runtime-dispatch-simd = []
html_report = []

[dependencies]
packed_simd = { version = "0.3.8", optional = true }

[dev-dependencies]
quickcheck = "1.0"
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ Your users can then compile with runtime dispatch using:
cargo build --release --features runtime-dispatch-simd
```

The second, `generic-simd`, uses `packed_simd` to provide a fast
The second, `generic-simd`, uses [`std::simd`](https://doc.rust-lang.org/std/simd/index.html) and [`#![feature(portable_simd)]`](https://github.com/rust-lang/rust/issues/86656) to provide a fast
architecture-agnostic SIMD codepath, but requires compiling with a nightly Rust toolchain.

Your users can compile with this codepath using:
Expand Down
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
//! [`naive_count_32`](fn.naive_count_32.html) method can be faster
//! still on small strings.
#![cfg_attr(feature = "generic-simd", feature(portable_simd))]

#![deny(missing_docs)]
#![cfg_attr(not(feature = "runtime-dispatch-simd"), no_std)]

Expand Down
29 changes: 15 additions & 14 deletions src/simd/generic.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
extern crate packed_simd;


#[cfg(not(feature = "runtime-dispatch-simd"))]
use core::mem;
use core::{mem, simd};

#[cfg(feature = "runtime-dispatch-simd")]
use std::mem;
use std::{mem, simd};

use self::packed_simd::{u8x32, u8x64, FromCast};
use simd::{u8x32, u8x64, cmp::SimdPartialEq, num::SimdInt};

const MASK: [u8; 64] = [
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
Expand All @@ -14,20 +15,20 @@ const MASK: [u8; 64] = [
];

/// Loads a `u8x64` from the bytes of `slice` starting at index `offset`.
///
/// # Safety
///
/// The sub-slice is taken with `get_unchecked(offset..)`, which performs no
/// bounds check, so the caller must guarantee `offset <= slice.len()`.
/// NOTE(review): presumably at least 64 bytes must remain after `offset` for
/// the vector load itself — confirm against the loader's documentation.
unsafe fn u8x64_from_offset(slice: &[u8], offset: usize) -> u8x64 {
// Diff view without +/- markers: the first call below appears to be the removed
// `packed_simd` loader and the second its `std::simd` replacement — TODO confirm.
u8x64::from_slice_unaligned_unchecked(slice.get_unchecked(offset..))
u8x64::from_slice(slice.get_unchecked(offset..))
}
/// Loads a `u8x32` from the bytes of `slice` starting at index `offset`.
///
/// # Safety
///
/// The sub-slice is taken with `get_unchecked(offset..)`, which performs no
/// bounds check, so the caller must guarantee `offset <= slice.len()`.
/// NOTE(review): presumably at least 32 bytes must remain after `offset` for
/// the vector load itself — confirm against the loader's documentation.
unsafe fn u8x32_from_offset(slice: &[u8], offset: usize) -> u8x32 {
// Diff view without +/- markers: the first call below appears to be the removed
// `packed_simd` loader and the second its `std::simd` replacement — TODO confirm.
u8x32::from_slice_unaligned_unchecked(slice.get_unchecked(offset..))
u8x32::from_slice(slice.get_unchecked(offset..))
}

/// Returns the sum of all lanes of `u8s` as a `usize`.
///
/// The lanes are first copied into a stack buffer sized by
/// `mem::size_of::<u8x64>()`, then each element is widened to `usize` before
/// being added, so the total is accumulated in `usize` rather than `u8`.
fn sum_x64(u8s: &u8x64) -> usize {
let mut store = [0; mem::size_of::<u8x64>()];
// Diff view without +/- markers: the first store call below appears to be the
// removed `packed_simd` method and the second its `std::simd` replacement —
// TODO confirm.
u8s.write_to_slice_unaligned(&mut store);
u8s.copy_to_slice(&mut store);
store.iter().map(|&e| e as usize).sum()
}
/// Returns the sum of all lanes of `u8s` as a `usize`.
///
/// The lanes are first copied into a stack buffer sized by
/// `mem::size_of::<u8x32>()`, then each element is widened to `usize` before
/// being added, so the total is accumulated in `usize` rather than `u8`.
fn sum_x32(u8s: &u8x32) -> usize {
let mut store = [0; mem::size_of::<u8x32>()];
// Diff view without +/- markers: the first store call below appears to be the
// removed `packed_simd` method and the second its `std::simd` replacement —
// TODO confirm.
u8s.write_to_slice_unaligned(&mut store);
u8s.copy_to_slice(&mut store);
store.iter().map(|&e| e as usize).sum()
}

Expand All @@ -44,7 +45,7 @@ pub fn chunk_count(haystack: &[u8], needle: u8) -> usize {
while haystack.len() >= offset + 64 * 255 {
let mut counts = u8x64::splat(0);
for _ in 0..255 {
counts -= u8x64::from_cast(u8x64_from_offset(haystack, offset).eq(needles_x64));
counts -= u8x64_from_offset(haystack, offset).simd_eq(needles_x64).to_int().cast();
offset += 64;
}
count += sum_x64(&counts);
Expand All @@ -54,7 +55,7 @@ pub fn chunk_count(haystack: &[u8], needle: u8) -> usize {
if haystack.len() >= offset + 64 * 128 {
let mut counts = u8x64::splat(0);
for _ in 0..128 {
counts -= u8x64::from_cast(u8x64_from_offset(haystack, offset).eq(needles_x64));
counts -= u8x64_from_offset(haystack, offset).simd_eq(needles_x64).to_int().cast();
offset += 64;
}
count += sum_x64(&counts);
Expand All @@ -66,15 +67,15 @@ pub fn chunk_count(haystack: &[u8], needle: u8) -> usize {
let mut counts = u8x32::splat(0);
for i in 0..(haystack.len() - offset) / 32 {
counts -=
u8x32::from_cast(u8x32_from_offset(haystack, offset + i * 32).eq(needles_x32));
u8x32_from_offset(haystack, offset + i * 32).simd_eq(needles_x32).to_int().cast();
}
count += sum_x32(&counts);

// Straggler; need to reset counts because prior loop can run 255 times
counts = u8x32::splat(0);
if haystack.len() % 32 != 0 {
counts -=
u8x32::from_cast(u8x32_from_offset(haystack, haystack.len() - 32).eq(needles_x32))
u8x32_from_offset(haystack, haystack.len() - 32).simd_eq(needles_x32).to_int().cast()
& u8x32_from_offset(&MASK, haystack.len() % 32);
}
count += sum_x32(&counts);
Expand All @@ -84,11 +85,11 @@ pub fn chunk_count(haystack: &[u8], needle: u8) -> usize {
}

/// Computes a per-lane flag vector marking bytes whose top two bits are not `10`.
///
/// Each lane of `u8s` is masked with `0b1100_0000` and compared for inequality
/// against `0b1000_0000`; the comparison result is then converted back to a
/// `u8x64`.
/// NOTE(review): lanes that pass the test are presumably all-ones (`0xFF`) and
/// the rest zero — confirm against the mask-conversion documentation.
fn is_leading_utf8_byte_x64(u8s: u8x64) -> u8x64 {
// Diff view without +/- markers: the first expression below appears to be the
// removed `packed_simd` form and the second its `std::simd` replacement —
// TODO confirm.
u8x64::from_cast((u8s & u8x64::splat(0b1100_0000)).ne(u8x64::splat(0b1000_0000)))
(u8s & u8x64::splat(0b1100_0000)).simd_ne(u8x64::splat(0b1000_0000)).to_int().cast()
}

/// Computes a per-lane flag vector marking bytes whose top two bits are not `10`.
///
/// Each lane of `u8s` is masked with `0b1100_0000` and compared for inequality
/// against `0b1000_0000`; the comparison result is then converted back to a
/// `u8x32`.
/// NOTE(review): lanes that pass the test are presumably all-ones (`0xFF`) and
/// the rest zero — confirm against the mask-conversion documentation.
fn is_leading_utf8_byte_x32(u8s: u8x32) -> u8x32 {
// Diff view without +/- markers: the first expression below appears to be the
// removed `packed_simd` form and the second its `std::simd` replacement —
// TODO confirm.
u8x32::from_cast((u8s & u8x32::splat(0b1100_0000)).ne(u8x32::splat(0b1000_0000)))
(u8s & u8x32::splat(0b1100_0000)).simd_ne(u8x32::splat(0b1000_0000)).to_int().cast()
}

pub fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
Expand Down

0 comments on commit 68ecb39

Please sign in to comment.