
Commit 9a44059

nodejs-github-bot authored and aduh95 committed
deps: update zlib to 1.3.0.1-motley-7d77fb7
PR-URL: #52516 Reviewed-By: Marco Ippolito <[email protected]> Reviewed-By: Mohammed Keyvanzadeh <[email protected]> Reviewed-By: Luigi Pinca <[email protected]>
1 parent 35643c1 commit 9a44059

File tree

10 files changed (+1032, -99 lines)


deps/zlib/BUILD.gn (+30)

@@ -441,6 +441,36 @@ executable("zlib_bench") {
   configs += [ "//build/config/compiler:no_chromium_code" ]
 }
 
+executable("minigzip") {
+  include_dirs = [ "." ]
+
+  sources = [ "test/minigzip.c" ]
+  if (!is_debug) {
+    configs -= [ "//build/config/compiler:default_optimization" ]
+    configs += [ "//build/config/compiler:optimize_speed" ]
+  }
+
+  deps = [ ":zlib" ]
+
+  configs -= [ "//build/config/compiler:chromium_code" ]
+  configs += [ "//build/config/compiler:no_chromium_code" ]
+}
+
+executable("zpipe") {
+  include_dirs = [ "." ]
+
+  sources = [ "examples/zpipe.c" ]
+  if (!is_debug) {
+    configs -= [ "//build/config/compiler:default_optimization" ]
+    configs += [ "//build/config/compiler:optimize_speed" ]
+  }
+
+  deps = [ ":zlib" ]
+
+  configs -= [ "//build/config/compiler:chromium_code" ]
+  configs += [ "//build/config/compiler:no_chromium_code" ]
+}
+
 if (!is_win || target_os != "winuwp") {
   executable("minizip_bin") {
     include_dirs = [ "." ]
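Note: test/minigzip.c, which the new minigzip target builds, is zlib's gzip-compatible test tool and is written against the gz* file API. For orientation only, the sketch below shows the kind of loop that API supports; gzip_file() is a hypothetical helper for illustration, not code taken from the tool.

#include <stdio.h>
#include "zlib.h"

/* Hypothetical helper: compress one file to gzip format with zlib's gz* API,
 * roughly the per-file operation test/minigzip.c performs. */
static int gzip_file(const char *src_path, const char *gz_path) {
  FILE *in = fopen(src_path, "rb");
  gzFile out = gzopen(gz_path, "wb");   /* "wb9" would request max compression */
  if (in == NULL || out == NULL) return -1;

  char buf[16384];
  size_t n;
  while ((n = fread(buf, 1, sizeof(buf), in)) > 0) {
    if (gzwrite(out, buf, (unsigned)n) == 0) {  /* 0 signals a write error */
      fclose(in);
      gzclose(out);
      return -1;
    }
  }
  fclose(in);
  return gzclose(out) == Z_OK ? 0 : -1;
}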

deps/zlib/CMakeLists.txt (+33, -4)

@@ -26,6 +26,8 @@ option(ENABLE_SIMD_AVX512 "Enable SIMD AXV512 optimizations" OFF)
 option(USE_ZLIB_RABIN_KARP_HASH "Enable bitstream compatibility with canonical zlib" OFF)
 option(BUILD_UNITTESTS "Enable standalone unit tests build" OFF)
 option(BUILD_MINIZIP_BIN "Enable building minzip_bin tool" OFF)
+option(BUILD_ZPIPE "Enable building zpipe tool" OFF)
+option(BUILD_MINIGZIP "Enable building minigzip tool" OFF)
 
 if (USE_ZLIB_RABIN_KARP_HASH)
   add_definitions(-DUSE_ZLIB_RABIN_KARP_ROLLING_HASH)
@@ -79,9 +81,16 @@ if (ENABLE_SIMD_OPTIMIZATIONS)
     add_definitions(-DRISCV_RVV)
     add_definitions(-DDEFLATE_SLIDE_HASH_RVV)
     add_definitions(-DADLER32_SIMD_RVV)
-    #TODO(cavalcantii): add remaining flags as we port optimizations to RVV.
-    # Required by CPU features detection code.
-    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --target=riscv64-unknown-linux-gnu -march=rv64gcv")
+
+    # TODO(cavalcantii): add remaining flags as we port optimizations to RVV.
+    # chunk_copy is required for READ64 and unconditional decode of literals.
+    add_definitions(-DINFLATE_CHUNK_GENERIC)
+    add_definitions(-DINFLATE_CHUNK_READ_64LE)
+
+    # Tested with clang-17, unaligned loads are required by read64 & chunk_copy.
+    # TODO(cavalcantii): replace internal clang flags for -munaligned-access
+    # when we have a newer compiler available.
+    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --target=riscv64-unknown-linux-gnu -march=rv64gcv -Xclang -target-feature -Xclang +unaligned-scalar-mem")
   endif()
 
 endif()
@@ -192,9 +201,14 @@ set(ZLIB_SRCS
 if (ENABLE_SIMD_OPTIMIZATIONS)
   if (CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64")
     message("RISCVV: Add optimizations.")
+    list(REMOVE_ITEM ZLIB_SRCS inflate.c)
     list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.h)
+    list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/chunkcopy.h)
     list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.h)
+
     list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.c)
+    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.c)
+    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inflate.c)
     list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.c)
   else()
     list(REMOVE_ITEM ZLIB_SRCS inflate.c)
@@ -339,7 +353,7 @@ if (BUILD_UNITTESTS)
 endif()
 
 #============================================================================
-# Minigzip tool
+# Minizip tool
 #============================================================================
 # TODO(cavalcantii): get it working on Windows.
 if (BUILD_MINIZIP_BIN)
@@ -349,3 +363,18 @@ if (BUILD_MINIZIP_BIN)
   )
   target_link_libraries(minizip_bin zlib)
 endif()
+
+#============================================================================
+# zpipe tool
+#============================================================================
+if (BUILD_ZPIPE)
+  add_executable(zpipe examples/zpipe.c)
+  target_link_libraries(zpipe zlib)
+endif()
+#============================================================================
+# MiniGzip tool
+#============================================================================
+if (BUILD_MINIGZIP)
+  add_executable(minigzip_bin test/minigzip.c)
+  target_link_libraries(minigzip_bin zlib)
+endif()
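The new BUILD_ZPIPE option builds examples/zpipe.c, zlib's canonical streaming example, as a standalone tool. As a reminder of what that example exercises, here is a condensed sketch of its compression loop; def_stream() is a renamed, trimmed version for illustration, not the file's exact def() function, and error paths plus the matching inflate side are omitted.

#include <stdio.h>
#include "zlib.h"

#define CHUNK 16384

/* Compress `source` to `dest` with deflate(), CHUNK bytes at a time. */
static int def_stream(FILE *source, FILE *dest, int level) {
  unsigned char in[CHUNK], out[CHUNK];
  z_stream strm;
  int ret, flush;

  strm.zalloc = Z_NULL;
  strm.zfree = Z_NULL;
  strm.opaque = Z_NULL;
  ret = deflateInit(&strm, level);
  if (ret != Z_OK) return ret;

  do {
    strm.avail_in = (uInt)fread(in, 1, CHUNK, source);
    flush = feof(source) ? Z_FINISH : Z_NO_FLUSH;
    strm.next_in = in;
    do {                                    /* drain all pending output */
      strm.avail_out = CHUNK;
      strm.next_out = out;
      ret = deflate(&strm, flush);
      fwrite(out, 1, CHUNK - strm.avail_out, dest);
    } while (strm.avail_out == 0);
  } while (flush != Z_FINISH);              /* ret is Z_STREAM_END here */

  (void)deflateEnd(&strm);
  return Z_OK;
}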

deps/zlib/adler32_simd.c (+76, -90)

@@ -41,9 +41,6 @@
  * [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates
  * of the adler s1 s2 of uint32_t type (see adler32.c).
  */
-/* Copyright (C) 2023 SiFive, Inc. All rights reserved.
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
 
 #include "adler32_simd.h"
 
@@ -368,103 +365,92 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */
 
 #elif defined(ADLER32_SIMD_RVV)
 #include <riscv_vector.h>
-/* adler32_rvv.c - RVV version of Adler-32
- * RVV 1.0 code contributed by Alex Chiang <[email protected]>
- * on https://github.com/zlib-ng/zlib-ng/pull/1532
- * Port from Simon Hosie's fork:
- * https://github.com/cloudflare/zlib/commit/40688b53c61cb9bfc36471acd2dc0800b7ebcab1
+
+/*
+ * Patch by Simon Hosie, from:
+ * https://github.com/cloudflare/zlib/pull/55
 */
 
 uint32_t ZLIB_INTERNAL adler32_simd_( /* RVV */
     uint32_t adler,
     const unsigned char *buf,
     unsigned long len)
 {
-    /* split Adler-32 into component sums */
-    uint32_t sum2 = (adler >> 16) & 0xffff;
-    adler &= 0xffff;
-
-    size_t left = len;
-    size_t vl = __riscv_vsetvlmax_e8m1();
-    vl = vl > 256 ? 256 : vl;
-    vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
-    vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
-    vuint16m2_t v_buf16_accu;
-
-    /*
-     * We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator.
-     * However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit
-     * accumulators to boost performance.
-     *
-     * The block_size is the largest multiple of vl that <= 256, because overflow would occur when
-     * vl > 256 (255 * 256 <= UINT16_MAX).
-     *
-     * We accumulate 8-bit data into a 16-bit accumulator and then
-     * move the data into the 32-bit accumulator at the last iteration.
+  size_t vl = __riscv_vsetvlmax_e8m2();
+  const vuint16m4_t zero16 = __riscv_vmv_v_x_u16m4(0, vl);
+  vuint16m4_t a_sum = zero16;
+  vuint32m8_t b_sum = __riscv_vmv_v_x_u32m8(0, vl);
+
+  /* Deal with the part which is not a multiple of vl first; because it's
+   * easier to zero-stuff the beginning of the checksum than it is to tweak the
+   * multipliers and sums for odd lengths afterwards.
+   */
+  size_t head = len & (vl - 1);
+  if (head > 0) {
+    vuint8m2_t zero8 = __riscv_vmv_v_x_u8m2(0, vl);
+    vuint8m2_t in = __riscv_vle8_v_u8m2(buf, vl);
+    in = __riscv_vslideup(zero8, in, vl - head, vl);
+    vuint16m4_t in16 = __riscv_vwcvtu_x(in, vl);
+    a_sum = in16;
+    buf += head;
+  }
+
+  /* We have a 32-bit accumulator, and in each iteration we add 22-times a
+   * 16-bit value, plus another 16-bit value. We periodically subtract up to
+   * 65535 times BASE to avoid overflow. b_overflow estimates how often we
+   * need to do this subtraction.
+   */
+  const int b_overflow = BASE / 23;
+  int fixup = b_overflow;
+  ssize_t iters = (len - head) / vl;
+  while (iters > 0) {
+    const vuint16m4_t a_overflow = __riscv_vrsub(a_sum, BASE, vl);
+    int batch = iters < 22 ? iters : 22;
+    iters -= batch;
+    b_sum = __riscv_vwmaccu(b_sum, batch, a_sum, vl);
+    vuint16m4_t a_batch = zero16, b_batch = zero16;
+
+    /* Do a short batch, where neither a_sum nor b_sum can overflow a 16-bit
+     * register. Then add them back into the main accumulators.
     */
-    size_t block_size = (256 / vl) * vl;
-    size_t nmax_limit = (NMAX / block_size);
-    size_t cnt = 0;
-    while (left >= block_size) {
-        v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
-        size_t subprob = block_size;
-        while (subprob > 0) {
-            vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
-            v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
-            v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
-            buf += vl;
-            subprob -= vl;
-        }
-        v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
-        v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
-        left -= block_size;
-        /* do modulo once each block of NMAX size */
-        if (++cnt >= nmax_limit) {
-            v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
-            cnt = 0;
-        }
+    while (batch-- > 0) {
+      vuint8m2_t in8 = __riscv_vle8_v_u8m2(buf, vl);
+      buf += vl;
+      b_batch = __riscv_vadd(b_batch, a_batch, vl);
+      a_batch = __riscv_vwaddu_wv(a_batch, in8, vl);
     }
-    /* the left len <= 256 now, we can use 16-bit accum safely */
-    v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
-    size_t res = left;
-    while (left >= vl) {
-        vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
-        v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
-        v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
-        buf += vl;
-        left -= vl;
+    vbool4_t ov = __riscv_vmsgeu(a_batch, a_overflow, vl);
+    a_sum = __riscv_vadd(a_sum, a_batch, vl);
+    a_sum = __riscv_vadd_mu(ov, a_sum, a_sum, 65536 - BASE, vl);
+    b_sum = __riscv_vwaddu_wv(b_sum, b_batch, vl);
+    if (--fixup <= 0) {
+      b_sum = __riscv_vnmsac(b_sum, BASE, __riscv_vsrl(b_sum, 16, vl), vl);
+      fixup = b_overflow;
    }
-    v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
-    v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
-    v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
-
-    vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
-    vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
-    vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);
-
-    v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);
-
-    vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
-    v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
-    uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum);
-
-    sum2 += (sum2_sum + adler * (len - left));
-
-    vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
-    v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
-    uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum);
-
-    adler += adler_sum;
-
-    while (left--) {
-        adler += *buf++;
-        sum2 += adler;
-    }
-
-    sum2 %= BASE;
-    adler %= BASE;
-
-    return adler | (sum2 << 16);
+  }
+  /* Adjust per-lane sums to have appropriate offsets from the end of the
+   * buffer.
+   */
+  const vuint16m4_t off = __riscv_vrsub(__riscv_vid_v_u16m4(vl), vl, vl);
+  vuint16m4_t bsum16 = __riscv_vncvt_x(__riscv_vremu(b_sum, BASE, vl), vl);
+  b_sum = __riscv_vadd(__riscv_vwmulu(a_sum, off, vl),
+                       __riscv_vwmulu(bsum16, vl, vl), vl);
+  bsum16 = __riscv_vncvt_x(__riscv_vremu(b_sum, BASE, vl), vl);
+
+  /* And finally, do a horizontal sum across the registers for the final
+   * result.
+   */
+  uint32_t a = adler & 0xffff;
+  uint32_t b = ((adler >> 16) + a * (len % BASE)) % BASE;
+  vuint32m1_t sca = __riscv_vmv_v_x_u32m1(a, 1);
+  vuint32m1_t scb = __riscv_vmv_v_x_u32m1(b, 1);
+  sca = __riscv_vwredsumu(a_sum, sca, vl);
+  scb = __riscv_vwredsumu(bsum16, scb, vl);
+  a = __riscv_vmv_x(sca);
+  b = __riscv_vmv_x(scb);
+  a %= BASE;
+  b %= BASE;
+  return (b << 16) | a;
 }
 
 #endif /* ADLER32_SIMD_SSSE3 */
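For readers comparing the old and new RVV paths: both compute the same Adler-32 value, defined by the scalar recurrence below. This is a minimal reference sketch, not zlib's adler32.c (which defers the % BASE reductions to once per NMAX bytes); the new vector code keeps per-lane partial sums a_sum and b_sum and only folds and reduces them at the end.

#include <stddef.h>
#include <stdint.h>

#define BASE 65521u  /* largest prime smaller than 2^16 */

/* Reference Adler-32: a = 1 + sum of bytes, b = sum of the running values of
 * a, both mod BASE; the checksum packs them as (b << 16) | a. */
static uint32_t adler32_ref(uint32_t adler, const uint8_t *buf, size_t len) {
  uint32_t a = adler & 0xffff;
  uint32_t b = (adler >> 16) & 0xffff;
  for (size_t i = 0; i < len; i++) {
    a = (a + buf[i]) % BASE;   /* production code batches these reductions */
    b = (b + a) % BASE;
  }
  return (b << 16) | a;
}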

deps/zlib/contrib/optimizations/chunkcopy.h (+75)

@@ -21,8 +21,10 @@
 
 #if defined(__clang__) || defined(__GNUC__) || defined(__llvm__)
 #define Z_BUILTIN_MEMCPY __builtin_memcpy
+#define Z_BUILTIN_MEMSET __builtin_memset
 #else
 #define Z_BUILTIN_MEMCPY zmemcpy
+#define Z_BUILTIN_MEMSET zmemset
 #endif
 
 #if defined(INFLATE_CHUNK_SIMD_NEON)
@@ -31,6 +33,8 @@ typedef uint8x16_t z_vec128i_t;
 #elif defined(INFLATE_CHUNK_SIMD_SSE2)
 #include <emmintrin.h>
 typedef __m128i z_vec128i_t;
+#elif defined(INFLATE_CHUNK_GENERIC)
+typedef struct { uint8_t x[16]; } z_vec128i_t;
 #else
 #error chunkcopy.h inflate chunk SIMD is not defined for your build target
 #endif
@@ -265,6 +269,77 @@ static inline z_vec128i_t v_load8_dup(const void* src) {
 static inline void v_store_128(void* out, const z_vec128i_t vec) {
   _mm_storeu_si128((__m128i*)out, vec);
 }
+#elif defined(INFLATE_CHUNK_GENERIC)
+/*
+ * Default implementations for chunk-copy functions rely on memcpy() being
+ * inlined by the compiler for best performance. This is most likely to work
+ * as expected when the length argument is constant (as is the case here) and
+ * the target supports unaligned loads and stores. Since that's not always a
+ * safe assumption, this may need extra compiler arguments such as
+ * `-mno-strict-align` or `-munaligned-access`, or the availability of
+ * extensions like SIMD.
+ */
+
+/*
+ * v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
+ * every 64-bit component of the 128-bit result (64-bit int splat).
+ */
+static inline z_vec128i_t v_load64_dup(const void* src) {
+  int64_t in;
+  Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
+  z_vec128i_t out;
+  for (int i = 0; i < sizeof(out); i += sizeof(in)) {
+    Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
+  }
+  return out;
+}
+
+/*
+ * v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
+ * every 32-bit component of the 128-bit result (32-bit int splat).
+ */
+static inline z_vec128i_t v_load32_dup(const void* src) {
+  int32_t in;
+  Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
+  z_vec128i_t out;
+  for (int i = 0; i < sizeof(out); i += sizeof(in)) {
+    Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
+  }
+  return out;
+}
+
+/*
+ * v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
+ * every 16-bit component of the 128-bit result (16-bit int splat).
+ */
+static inline z_vec128i_t v_load16_dup(const void* src) {
+  int16_t in;
+  Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
+  z_vec128i_t out;
+  for (int i = 0; i < sizeof(out); i += sizeof(in)) {
+    Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
+  }
+  return out;
+}
+
+/*
+ * v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
+ * component of the 128-bit result (8-bit int splat).
+ */
+static inline z_vec128i_t v_load8_dup(const void* src) {
+  int8_t in = *(const uint8_t*)src;
+  z_vec128i_t out;
+  Z_BUILTIN_MEMSET(&out, in, sizeof(out));
+  return out;
+}
+
+/*
+ * v_store_128(): store the 128-bit vec in a memory destination (that might
+ * not be 16-byte aligned) void* out.
+ */
+static inline void v_store_128(void* out, const z_vec128i_t vec) {
+  Z_BUILTIN_MEMCPY(out, &vec, sizeof(vec));
+}
 #endif
 
 /*
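The generic helpers added above emulate a 128-bit vector with a 16-byte struct plus memcpy()/memset(), counting on the compiler to turn the fixed-size copies into wide loads and stores. Below is a standalone illustration of that splat-and-store idea using plain libc calls rather than the header's Z_BUILTIN_* macros; byte_fill() is a hypothetical example, not part of chunkcopy.h, and it clamps the final store, whereas the real chunk copy writes whole 16-byte chunks and relies on output-buffer slack.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef struct { uint8_t x[16]; } vec128;    /* same shape as the generic z_vec128i_t */

/* Splat one byte across a 16-byte "vector", then store it repeatedly until
 * `len` bytes are covered; this is the pattern inflate's chunk copy uses to
 * expand a distance-one match. */
static void byte_fill(uint8_t *out, uint8_t value, size_t len) {
  vec128 v;
  memset(&v, value, sizeof(v));              /* like v_load8_dup() */
  for (size_t i = 0; i < len; i += sizeof(v)) {
    size_t n = len - i < sizeof(v) ? len - i : sizeof(v);
    memcpy(out + i, &v, n);                  /* like v_store_128(), tail-clamped */
  }
}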
