Skip to content

Commit 1be76ea

Browse files
authored
perf: added an optimized impl of PartialEq to Mv (#1400)
Another small improvement I noticed while comparing perf to `dav1d`. This improves the performance of `add_temporal_candidate` by ~20% (on M3), so it now matches the one from `dav1d` (to less than 0.1% diff), as measured when decoding Chimera-AV1-8bit-1920x1080-6736kbps.ivf. The overall improvement is about 0.5%. Edit: also for `RefMvs{Mv,Ref}Pair`, which seems to have a smaller effect.
2 parents 3e104b9 + 3db6886 commit 1be76ea

File tree

2 files changed

+38
-3
lines changed

2 files changed

+38
-3
lines changed

src/levels.rs

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -354,7 +354,7 @@ pub enum InterIntraType {
354354
/// which includes `&self` methods, including autogenerated ones like [`PartialEq::eq`].
355355
///
356356
/// [`refmvs_block`]: crate::src::refmvs::refmvs_block
357-
#[derive(Clone, Copy, PartialEq, Eq, Default, FromZeroes, FromBytes, AsBytes)]
357+
#[derive(Clone, Copy, Eq, Default, FromZeroes, FromBytes, AsBytes)]
358358
#[repr(C)]
359359
pub struct Mv {
360360
pub y: i16,
@@ -390,6 +390,17 @@ impl Neg for Mv {
390390
}
391391
}
392392

393+
impl PartialEq for Mv {
394+
#[inline(always)]
395+
fn eq(&self, other: &Self) -> bool {
396+
// `#[derive(PartialEq)]` compares per-field with `&&`,
397+
// which isn't optimized well and isn't coalesced into wider loads.
398+
// Comparing all of the bytes at once optimizes better with wider loads.
399+
// See <https://github.com/rust-lang/rust/issues/140167>.
400+
self.as_bytes() == other.as_bytes()
401+
}
402+
}
403+
393404
#[derive(Debug, Clone, Copy, PartialEq, Eq, FromRepr, Default)]
394405
pub enum MotionMode {
395406
#[default]

src/refmvs.rs

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ use std::mem;
2828
use std::mem::MaybeUninit;
2929
use std::ptr;
3030
use std::slice;
31+
use zerocopy::AsBytes;
3132
use zerocopy::FromZeroes;
3233

3334
#[derive(Clone, Copy, Default, PartialEq, Eq)]
@@ -38,7 +39,7 @@ pub struct RefMvsTemporalBlock {
3839
}
3940
const _: () = assert!(mem::size_of::<RefMvsTemporalBlock>() == 5);
4041

41-
#[derive(Clone, Copy, PartialEq, Eq, FromZeroes)]
42+
#[derive(Clone, Copy, Eq, FromZeroes, AsBytes)]
4243
// In C, this is packed and is 2 bytes.
4344
// In Rust, being packed and aligned is tricky
4445
#[repr(C, align(2))]
@@ -47,19 +48,42 @@ pub struct RefMvsRefPair {
4748
}
4849
const _: () = assert!(mem::size_of::<RefMvsRefPair>() == 2);
4950

51+
impl PartialEq for RefMvsRefPair {
52+
#[inline(always)]
53+
fn eq(&self, other: &Self) -> bool {
54+
// `#[derive(PartialEq)]` compares per-field with `&&`,
55+
// which isn't optimized well and isn't coalesced into wider loads.
56+
// Comparing all of the bytes at once optimizes better with wider loads.
57+
// See <https://github.com/rust-lang/rust/issues/140167>.
58+
self.as_bytes() == other.as_bytes()
59+
}
60+
}
61+
5062
impl From<[i8; 2]> for RefMvsRefPair {
5163
fn from(from: [i8; 2]) -> Self {
5264
RefMvsRefPair { r#ref: from }
5365
}
5466
}
5567

56-
#[derive(Clone, Copy, Default, PartialEq, Eq, FromZeroes)]
68+
#[derive(Clone, Copy, Default, Eq, FromZeroes, AsBytes)]
5769
#[repr(C)]
70+
#[repr(align(4))] // Is a `union` with a `uint64_t` in C, so `align(8)`, but `align(8)` doesn't allow `align(4)` for `RefMvsBlock`.
5871
pub struct RefMvsMvPair {
5972
pub mv: [Mv; 2],
6073
}
6174
const _: () = assert!(mem::size_of::<RefMvsMvPair>() == 8);
6275

76+
impl PartialEq for RefMvsMvPair {
77+
#[inline(always)]
78+
fn eq(&self, other: &Self) -> bool {
79+
// `#[derive(PartialEq)]` compares per-field with `&&`,
80+
// which isn't optimized well and isn't coalesced into wider loads.
81+
// Comparing all of the bytes at once optimizes better with wider loads.
82+
// See <https://github.com/rust-lang/rust/issues/140167>.
83+
self.as_bytes() == other.as_bytes()
84+
}
85+
}
86+
6387
#[derive(Clone, Copy, FromZeroes)]
6488
// In C, this is packed and is 12 bytes.
6589
// In Rust, being packed and aligned is tricky

0 commit comments

Comments
 (0)