- 
                Notifications
    You must be signed in to change notification settings 
- Fork 13.9k
Description
I'd noticed the pclmulqdq intrinsics in crc32fast were prominent in a perf report of a benchmark last night. Somewhat shockingly, there were standalone functions whose entire body was `pclmulqdq xmm0, xmm1, 17; ret` and `pclmulqdq xmm0, xmm1, 0; ret`, which also constrains the callers' choice of xmm registers! After a bit of digging, it seems to be a regression in nightly.
the specific regression i'd started at can be reproduced with cargo bench in https://github.com/srijs/rust-crc32fast .
cargo +1.85.1 bench produces
running 4 tests
test bench_kilobyte_baseline    ... bench:         130 ns/iter (+/- 2) = 7876 MB/s
test bench_kilobyte_specialized ... bench:          48 ns/iter (+/- 0) = 21333 MB/s
test bench_megabyte_baseline    ... bench:     137,017 ns/iter (+/- 247) = 7652 MB/s
test bench_megabyte_specialized ... bench:      48,153 ns/iter (+/- 51) = 21775 MB/s
whereas
cargo +nightly bench produces
running 4 tests
test bench_kilobyte_baseline    ... bench:         130 ns/iter (+/- 1) = 7876 MB/s
test bench_kilobyte_specialized ... bench:         156 ns/iter (+/- 0) = 6564 MB/s
test bench_megabyte_baseline    ... bench:     137,229 ns/iter (+/- 393) = 7641 MB/s
test bench_megabyte_specialized ... bench:     145,632 ns/iter (+/- 377) = 7200 MB/s
after looking at perf a bit i believe this is representative: https://rust.godbolt.org/z/8dxcE4vo1 . i'm including everything there in this issue as well.
Code
I tried this code:
use std::arch::x86_64 as arch;
// Exported, unmangled entry point compiled with pclmulqdq, sse2 and sse4.1 enabled.
// It does nothing but forward its three arguments to `reduce128`, which in this
// snippet carries no `#[target_feature]` attribute of its own.
#[target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")]
#[no_mangle]
pub unsafe fn reduce128_caller(a: arch::__m128i, b: arch::__m128i, keys: arch::__m128i) -> arch::__m128i {
    reduce128(a, b, keys)
}
unsafe fn reduce128(a: arch::__m128i, b: arch::__m128i, keys: arch::__m128i) -> arch::__m128i {
    let t1 = arch::_mm_clmulepi64_si128(a, keys, 0x00);
    let t2 = arch::_mm_clmulepi64_si128(a, keys, 0x11);
    arch::_mm_xor_si128(arch::_mm_xor_si128(b, t1), t2)
}
I expected to see this happen (with -C opt-level=3):
reduce128_caller:
        mov     rax, rdi
        movdqa  xmm0, xmmword ptr [rsi]
        movdqa  xmm1, xmmword ptr [rcx]
        movdqa  xmm2, xmm0
        pclmulqdq       xmm2, xmm1, 0
        pxor    xmm2, xmmword ptr [rdx]
        pclmulqdq       xmm0, xmm1, 17
        pxor    xmm2, xmm0
        movdqa  xmmword ptr [rdi], xmm2
        ret
Instead, this happened (also -C opt-level=3):
core::core_arch::x86::pclmulqdq::_mm_clmulepi64_si128::h9ea1fa421d47acc5:
        pclmulqdq       xmm0, xmm1, 17
        ret
core::core_arch::x86::pclmulqdq::_mm_clmulepi64_si128::heb2402630e2a6f04:
        pclmulqdq       xmm0, xmm1, 0
        ret
reduce128_caller:
        jmp     example::reduce128::h8b5076bc8edc1d53
example::reduce128::h8b5076bc8edc1d53:
        sub     rsp, 72
        movaps  xmmword ptr [rsp + 16], xmm2
        movaps  xmmword ptr [rsp + 48], xmm1
        movaps  xmmword ptr [rsp], xmm0
        movaps  xmm1, xmm2
        call    core::core_arch::x86::pclmulqdq::_mm_clmulepi64_si128::heb2402630e2a6f04
        movaps  xmmword ptr [rsp + 32], xmm0
        movaps  xmm0, xmmword ptr [rsp]
        movaps  xmm1, xmmword ptr [rsp + 16]
        call    core::core_arch::x86::pclmulqdq::_mm_clmulepi64_si128::h9ea1fa421d47acc5
        movaps  xmm1, xmmword ptr [rsp + 32]
        xorps   xmm1, xmmword ptr [rsp + 48]
        xorps   xmm0, xmm1
        add     rsp, 72
        ret
Version it worked on
1.85.1, 1.31.0, and a half dozen in between.
additionally, beta (rust version 1.86.0-beta.7 (7824ede 2025-03-22)) seems good.
nightly with -C opt-level=3 -C target-feature=+pclmul still does great.
Version with regression
in the above godbolt link, i see --version in the rustc nightly tab provides rustc 1.87.0-nightly (a2e63569f 2025-03-26). this is consistent with how i first saw this locally:
rustc +nightly --version --verbose:
rustc +nightly --version --verbose
rustc 1.87.0-nightly (a2e63569f 2025-03-26)
binary: rustc
commit-hash: a2e63569fd6702ac5dd027a80a9fdaadce73adae
commit-date: 2025-03-26
host: x86_64-unknown-linux-gnu
release: 1.87.0-nightly
LLVM version: 20.1.1
Related improvement along the way
adding the same target_feature attribute to the inner function makes nightly produce somewhat better-than-baseline code: https://rust.godbolt.org/z/sGrYedeaP
use std::arch::x86_64 as arch;
// Exported, unmangled entry point compiled with pclmulqdq, sse2 and sse4.1 enabled.
// It only forwards its three arguments to `reduce128`; in this variant of the
// snippet `reduce128` is annotated with the same `#[target_feature]` attribute.
#[target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")]
#[no_mangle]
pub unsafe fn reduce128_caller(a: arch::__m128i, b: arch::__m128i, keys: arch::__m128i) -> arch::__m128i {
    reduce128(a, b, keys)
}
#[target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")]
unsafe fn reduce128(a: arch::__m128i, b: arch::__m128i, keys: arch::__m128i) -> arch::__m128i {
    let t1 = arch::_mm_clmulepi64_si128(a, keys, 0x00);
    let t2 = arch::_mm_clmulepi64_si128(a, keys, 0x11);
    arch::_mm_xor_si128(arch::_mm_xor_si128(b, t1), t2)
}
with rustc +nightly -C opt-level=3 yields:
reduce128_caller:
        movdqa  xmm3, xmm0
        pclmulqdq       xmm3, xmm2, 0
        pclmulqdq       xmm0, xmm2, 17
        pxor    xmm3, xmm1
        pxor    xmm0, xmm3
        ret
whereas before the codegen was identical regardless of the target_feature attribute on the inner function. so at least in some cases there is a modest improvement?
@rustbot modify labels: +regression-from-stable-to-nightly -regression-untriaged
LLVM upstream issue: llvm/llvm-project#142321