Skip to content

Commit

Permalink
fix(deflate): further deflate performance improvements especially on …
Browse files Browse the repository at this point in the history
…fast mode
  • Loading branch information
oyvindln committed Mar 6, 2025
1 parent d647cb6 commit 5a65104
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 38 deletions.
2 changes: 1 addition & 1 deletion build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ sed -i "s/$OLD/$NEW/g" Cargo.toml
rm -f libminiz_oxide_c_api.a

if [[ ($# == 0 || $1 == "--release" ) ]]; then
RUSTFLAGS="-g -C debug-assertions" cargo build --release --features=miniz_zip -- || exit 1
RUSTFLAGS="-g -C debug-assertions" cargo build --release --features=miniz_zip,simd -- || exit 1
cp target/release/libminiz_oxide_c_api.a .
elif [[ $1 == "--debug" ]]; then
cargo build --features=miniz_zip || exit 1
Expand Down
5 changes: 3 additions & 2 deletions miniz_oxide/src/deflate/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use alloc::vec;

/// Size of the buffer of lz77 encoded data.
pub const LZ_CODE_BUF_SIZE: usize = 64 * 1024;
pub const LZ_CODE_BUF_MASK: usize = LZ_CODE_BUF_SIZE - 1;
/// Size of the output buffer.
pub const OUT_BUF_SIZE: usize = (LZ_CODE_BUF_SIZE * 13) / 10;
pub const LZ_DICT_FULL_SIZE: usize = LZ_DICT_SIZE + MAX_MATCH_LEN - 1 + 1;
Expand All @@ -20,8 +21,8 @@ pub const LZ_HASH_SHIFT: i32 = (LZ_HASH_BITS + 2) / 3;
pub const LZ_HASH_SIZE: usize = 1 << LZ_HASH_BITS;

#[inline]
pub fn update_hash(current_hash: u16, byte: u8) -> u16 {
((current_hash << LZ_HASH_SHIFT) ^ u16::from(byte)) & (LZ_HASH_SIZE as u16 - 1)
pub const fn update_hash(current_hash: u16, byte: u8) -> u16 {
((current_hash << LZ_HASH_SHIFT) ^ byte as u16) & (LZ_HASH_SIZE as u16 - 1)
}

pub struct HashBuffers {
Expand Down
79 changes: 44 additions & 35 deletions miniz_oxide/src/deflate/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ use super::super::*;
use super::deflate_flags::*;
use super::CompressionLevel;
use crate::deflate::buffer::{
update_hash, HashBuffers, LocalBuf, LZ_CODE_BUF_SIZE, LZ_DICT_FULL_SIZE, LZ_HASH_BITS,
LZ_HASH_SHIFT, LZ_HASH_SIZE, OUT_BUF_SIZE,
update_hash, HashBuffers, LocalBuf, LZ_CODE_BUF_MASK, LZ_CODE_BUF_SIZE, LZ_DICT_FULL_SIZE,
LZ_HASH_BITS, LZ_HASH_SHIFT, LZ_HASH_SIZE, OUT_BUF_SIZE,
};
use crate::deflate::stored::compress_stored;
use crate::deflate::zlib;
Expand All @@ -25,27 +25,27 @@ pub(crate) const MAX_PROBES_MASK: u32 = 0xFFF;

const MAX_SUPPORTED_HUFF_CODESIZE: usize = 15;

/// Length code for length values.
#[rustfmt::skip]
const LEN_SYM: [u16; 256] = [
257, 258, 259, 260, 261, 262, 263, 264, 265, 265, 266, 266, 267, 267, 268, 268,
269, 269, 269, 269, 270, 270, 270, 270, 271, 271, 271, 271, 272, 272, 272, 272,
273, 273, 273, 273, 273, 273, 273, 273, 274, 274, 274, 274, 274, 274, 274, 274,
275, 275, 275, 275, 275, 275, 275, 275, 276, 276, 276, 276, 276, 276, 276, 276,
277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277,
278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278,
279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279,
280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280,
281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281,
281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281,
282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282,
282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282,
283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283,
283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283,
284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284,
284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 285
// Length codes for length values, stored with 256 subtracted (see LEN_SYM_OFFSET).
// We use an offset to help with bounds check avoidance, as we can mask the values with 31
// (keeping them below 32), and it also saves some memory as we can use a u8 instead of a u16.
// Conveniently, our table is large enough that we can get away with using an
// offset of 256, which results in very efficient code.
const LEN_SYM: [u8; 256] = [
1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15,
15, 15, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19,
19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23,
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27,
27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 29,
];

const LEN_SYM_OFFSET: usize = 256;

/// Number of extra bits for length values.
#[rustfmt::skip]
const LEN_EXTRA: [u8; 256] = [
Expand Down Expand Up @@ -1501,19 +1501,22 @@ fn compress_lz_codes(
let sym;
let num_extra_bits;

let match_len = lz_code_buf[i] as usize;
let match_len = lz_code_buf[i & LZ_CODE_BUF_MASK] as usize;

let match_dist = read_u16_le(lz_code_buf, i + 1);
let match_dist = lz_code_buf[(i + 1) & &LZ_CODE_BUF_MASK] as u16
| ((lz_code_buf[(i + 2) & LZ_CODE_BUF_MASK] as u16) << 8);

i += 3;

debug_assert!(huff.code_sizes[0][LEN_SYM[match_len] as usize] != 0);
debug_assert!(huff.code_sizes[0][LEN_SYM[match_len] as usize + LEN_SYM_OFFSET] != 0);
let len_sym = (LEN_SYM[match_len] & 31) as usize + LEN_SYM_OFFSET;

bb.put_fast(
u64::from(huff.codes[0][LEN_SYM[match_len] as usize]),
u32::from(huff.code_sizes[0][LEN_SYM[match_len] as usize]),
u64::from(huff.codes[0][len_sym]),
u32::from(huff.code_sizes[0][len_sym]),
);
bb.put_fast(
match_len as u64 & u64::from(BITMASKS[LEN_EXTRA[match_len] as usize]),
match_len as u64 & u64::from(BITMASKS[(LEN_EXTRA[match_len] & 7) as usize]),
u32::from(LEN_EXTRA[match_len]),
);

Expand All @@ -1531,14 +1534,14 @@ fn compress_lz_codes(
u32::from(huff.code_sizes[1][sym]),
);
bb.put_fast(
u64::from(match_dist) & u64::from(BITMASKS[num_extra_bits]),
u64::from(match_dist) & u64::from(BITMASKS[num_extra_bits & 15]),
num_extra_bits as u32,
);
} else {
// The lz code was a literal
for _ in 0..3 {
flags >>= 1;
let lit = lz_code_buf[i & (LZ_CODE_BUF_SIZE - 1)];
let lit = lz_code_buf[i & LZ_CODE_BUF_MASK];
i += 1;

debug_assert!(huff.code_sizes[0][lit as usize] != 0);
Expand Down Expand Up @@ -1729,14 +1732,14 @@ pub(crate) fn record_literal(h: &mut HuffmanOxide, lz: &mut LZOxide, lit: u8) {
h.count[0][lit as usize] += 1;
}

fn record_match(h: &mut HuffmanOxide, lz: &mut LZOxide, mut match_len: u32, mut match_dist: u32) {
fn record_match(h: &mut HuffmanOxide, lz: &mut LZOxide, match_len: u32, mut match_dist: u32) {
debug_assert!(match_len >= MIN_MATCH_LEN.into());
debug_assert!(match_dist >= 1);
debug_assert!(match_dist as usize <= LZ_DICT_SIZE);

lz.total_bytes += match_len;
match_dist -= 1;
match_len -= u32::from(MIN_MATCH_LEN);
let match_len = (match_len - u32::from(MIN_MATCH_LEN)) as u8;
lz.write_code(match_len as u8);
lz.write_code(match_dist as u8);
lz.write_code((match_dist >> 8) as u8);
Expand All @@ -1751,8 +1754,9 @@ fn record_match(h: &mut HuffmanOxide, lz: &mut LZOxide, mut match_len: u32, mut
LARGE_DIST_SYM[((match_dist >> 8) & 127) as usize]
} as usize;
h.count[1][symbol] += 1;
// Perf - go via u8 to help optimize out bounds check.
h.count[0][LEN_SYM[usize::from(match_len as u8)] as usize] += 1;
// Mask the values from LEN_SYM here as the compiler isn't quite smart enough to infer
// that it only contains values smaller than 32.
h.count[0][(LEN_SYM[match_len as usize] as usize & 31) + LEN_SYM_OFFSET] += 1;
}

fn compress_normal(d: &mut CompressorOxide, callback: &mut CallbackOxide) -> bool {
Expand Down Expand Up @@ -1802,6 +1806,7 @@ fn compress_normal(d: &mut CompressorOxide, callback: &mut CallbackOxide) -> boo
dst_pos = (dst_pos + 1) & LZ_DICT_SIZE_MASK;
ins_pos += 1;
}

src_pos += num_bytes_to_process;
} else {
let dictb = &mut d.dict.b;
Expand Down Expand Up @@ -2047,6 +2052,7 @@ fn compress_fast(d: &mut CompressorOxide, callback: &mut CallbackOxide) -> bool
// that ends after the end of the input data.
cur_match_len = cmp::min(cur_match_len, lookahead_size as u32);
debug_assert!(cur_match_len >= MIN_MATCH_LEN.into());
debug_assert!(cur_match_len <= MAX_MATCH_LEN as u32);
debug_assert!(cur_match_dist >= 1);
debug_assert!(cur_match_dist as usize <= LZ_DICT_SIZE);
cur_match_dist -= 1;
Expand All @@ -2064,8 +2070,11 @@ fn compress_fast(d: &mut CompressorOxide, callback: &mut CallbackOxide) -> bool
[LARGE_DIST_SYM[(cur_match_dist >> 8) as usize] as usize] += 1;
}

d.huff.count[0][LEN_SYM[(cur_match_len - u32::from(MIN_MATCH_LEN)) as usize]
as usize] += 1;
d.huff.count[0][(LEN_SYM
[(cur_match_len - u32::from(MIN_MATCH_LEN)) as usize & 255]
as usize
& 31)
+ LEN_SYM_OFFSET] += 1;
}
} else {
d.lz.write_code(first_trigram as u8);
Expand Down

0 comments on commit 5a65104

Please sign in to comment.