Skip to content

Commit a551b6c

Browse files
authored
Add a C99 test program (ridiculousfish#79)
* Add C99 test program * Fix C99 issues Change out static_cast to C-style cast Rename functions to avoid overloading: libdivide_s64_signbits_vec512 libdivide_s64_signbits_vec256 libdivide_s64_signbits_vec128
1 parent 3c231c1 commit a551b6c

File tree

5 files changed

+131
-26
lines changed

5 files changed

+131
-26
lines changed

CMakeLists.txt

+5
Original file line numberDiff line numberDiff line change
@@ -255,19 +255,24 @@ if (BUILD_TESTS)
255255
find_package(Threads REQUIRED QUIET)
256256

257257
add_executable(tester test/tester.cpp)
258+
add_executable(test_c99 test/test_c99.c)
258259
add_executable(benchmark test/benchmark.cpp)
259260
add_executable(benchmark_branchfree test/benchmark_branchfree.cpp)
260261

261262
target_link_libraries(tester libdivide Threads::Threads)
263+
target_link_libraries(test_c99 libdivide)
262264
target_link_libraries(benchmark libdivide)
263265
target_link_libraries(benchmark_branchfree libdivide)
264266

265267
target_compile_options(tester PRIVATE "${LIBDIVIDE_FLAGS}" "${NO_VECTORIZE}")
268+
target_compile_options(test_c99 PRIVATE "${LIBDIVIDE_FLAGS}" "${NO_VECTORIZE}")
266269
target_compile_options(benchmark PRIVATE "${LIBDIVIDE_FLAGS}" "${NO_VECTORIZE_C}")
267270
target_compile_options(benchmark_branchfree PRIVATE "${LIBDIVIDE_FLAGS}" "${NO_VECTORIZE}")
268271
set_property(TARGET benchmark_branchfree PROPERTY CXX_STANDARD 11)
272+
set_property(TARGET test_c99 PROPERTY C_STANDARD 99)
269273

270274
target_compile_definitions(tester PRIVATE "${LIBDIVIDE_ASSERTIONS}" "${LIBDIVIDE_VECTOR_EXT}")
275+
target_compile_definitions(test_c99 PRIVATE "${LIBDIVIDE_ASSERTIONS}" "${LIBDIVIDE_VECTOR_EXT}")
271276
target_compile_definitions(benchmark PRIVATE "${LIBDIVIDE_ASSERTIONS}" "${LIBDIVIDE_VECTOR_EXT}")
272277
target_compile_definitions(benchmark_branchfree PRIVATE "${LIBDIVIDE_ASSERTIONS}" "${LIBDIVIDE_VECTOR_EXT}")
273278
endif()

appveyor.yml

+6
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ for:
2929
- cppcheck . --error-exitcode=1 --force -i doc
3030
- ./tester
3131
- ./benchmark_branchfree
32+
- ./test_c99
3233

3334
- matrix:
3435
only:
@@ -46,6 +47,7 @@ for:
4647
- cppcheck . --error-exitcode=1 --force -i doc
4748
- ./tester
4849
- ./benchmark_branchfree
50+
- ./test_c99
4951

5052
- matrix:
5153
only:
@@ -58,6 +60,7 @@ for:
5860
- cd Release
5961
- tester.exe
6062
- benchmark_branchfree.exe
63+
- test_c99.exe
6164

6265
- matrix:
6366
only:
@@ -73,6 +76,7 @@ for:
7376
- cd Release
7477
- tester.exe
7578
- benchmark_branchfree.exe
79+
- test_c99.exe
7680

7781
- matrix:
7882
only:
@@ -85,6 +89,7 @@ for:
8589
- cd Release
8690
- tester.exe
8791
- benchmark_branchfree.exe
92+
- test_c99.exe
8893

8994
- matrix:
9095
only:
@@ -107,5 +112,6 @@ for:
107112
- cd Release
108113
- tester.exe
109114
- benchmark_branchfree.exe
115+
- test_c99.exe
110116
- cd ../test/avr
111117
- pio run -t Simulate -e megaatmega2560_Test

libdivide.code-workspace

+2-1
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,8 @@
8282
"xlocale": "cpp",
8383
"xlocbuf": "cpp",
8484
"xlocinfo": "cpp",
85-
"xlocmes": "cpp"
85+
"xlocmes": "cpp",
86+
"xmemory0": "c"
8687
}
8788
}
8889
}

libdivide.h

+25-25
Original file line numberDiff line numberDiff line change
@@ -362,7 +362,7 @@ static LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) {
362362
#elif defined(LIBDIVIDE_VC)
363363
unsigned long result;
364364
if (_BitScanReverse(&result, (unsigned long)val)) {
365-
return static_cast<int16_t>(15 - result);
365+
return (int16_t)(15 - result);
366366
}
367367
return 0;
368368
#else
@@ -676,7 +676,7 @@ static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen(
676676
}
677677

678678
struct libdivide_u16_t result;
679-
uint8_t floor_log_2_d = static_cast<uint8_t>(15 - libdivide_count_leading_zeros16(d));
679+
uint8_t floor_log_2_d = (uint8_t)(15 - libdivide_count_leading_zeros16(d));
680680

681681
// Power of 2
682682
if ((d & (d - 1)) == 0) {
@@ -1183,7 +1183,7 @@ static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_internal_s16_gen(
11831183
if ((absD & (absD - 1)) == 0) {
11841184
// Branchfree and normal paths are exactly the same
11851185
result.magic = 0;
1186-
result.more = static_cast<uint8_t>(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
1186+
result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
11871187
} else {
11881188
LIBDIVIDE_ASSERT(floor_log_2_d >= 1);
11891189

@@ -1198,15 +1198,15 @@ static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_internal_s16_gen(
11981198
// This works if e < 2**floor_log_2_d.
11991199
if (!branchfree && e < ((uint16_t)1 << floor_log_2_d)) {
12001200
// This power works
1201-
more = static_cast<uint8_t>(floor_log_2_d - 1);
1201+
more = (uint8_t)(floor_log_2_d - 1);
12021202
} else {
12031203
// We need to go one higher. This should not make proposed_m
12041204
// overflow, but it will make it negative when interpreted as an
12051205
// int16_t.
12061206
proposed_m += proposed_m;
12071207
const uint16_t twice_rem = rem + rem;
12081208
if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
1209-
more = static_cast<uint8_t>(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
1209+
more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
12101210
}
12111211

12121212
proposed_m += 1;
@@ -1703,23 +1703,23 @@ static LIBDIVIDE_INLINE int64x2_t libdivide_s64_branchfree_do_vec128(
17031703
// Logical right shift by runtime value.
17041704
// NEON implements right shift as left shifts by negative values.
17051705
static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_neon_srl(uint32x4_t v, uint8_t amt) {
1706-
int32_t wamt = static_cast<int32_t>(amt);
1706+
int32_t wamt = (int32_t)(amt);
17071707
return vshlq_u32(v, vdupq_n_s32(-wamt));
17081708
}
17091709

17101710
static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_neon_srl(uint64x2_t v, uint8_t amt) {
1711-
int64_t wamt = static_cast<int64_t>(amt);
1711+
int64_t wamt = (int64_t)(amt);
17121712
return vshlq_u64(v, vdupq_n_s64(-wamt));
17131713
}
17141714

17151715
// Arithmetic right shift by runtime value.
17161716
static LIBDIVIDE_INLINE int32x4_t libdivide_s32_neon_sra(int32x4_t v, uint8_t amt) {
1717-
int32_t wamt = static_cast<int32_t>(amt);
1717+
int32_t wamt = (int32_t)(amt);
17181718
return vshlq_s32(v, vdupq_n_s32(-wamt));
17191719
}
17201720

17211721
static LIBDIVIDE_INLINE int64x2_t libdivide_s64_neon_sra(int64x2_t v, uint8_t amt) {
1722-
int64_t wamt = static_cast<int64_t>(amt);
1722+
int64_t wamt = (int64_t)(amt);
17231723
return vshlq_s64(v, vdupq_n_s64(-wamt));
17241724
}
17251725

@@ -1771,7 +1771,7 @@ static LIBDIVIDE_INLINE uint64x2_t libdivide_mullhi_u64_vec128(uint64x2_t x, uin
17711771

17721772
static LIBDIVIDE_INLINE int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64_t sy) {
17731773
int64x2_t p = vreinterpretq_s64_u64(
1774-
libdivide_mullhi_u64_vec128(vreinterpretq_u64_s64(x), static_cast<uint64_t>(sy)));
1774+
libdivide_mullhi_u64_vec128(vreinterpretq_u64_s64(x), (uint64_t)(sy)));
17751775
int64x2_t y = vdupq_n_s64(sy);
17761776
int64x2_t t1 = vandq_s64(libdivide_s64_signbits(x), y);
17771777
int64x2_t t2 = vandq_s64(libdivide_s64_signbits(y), x);
@@ -1998,7 +1998,7 @@ static LIBDIVIDE_INLINE __m512i libdivide_s64_branchfree_do_vec512(
19981998

19991999
//////// Internal Utility Functions
20002000

2001-
static LIBDIVIDE_INLINE __m512i libdivide_s64_signbits(__m512i v) {
2001+
static LIBDIVIDE_INLINE __m512i libdivide_s64_signbits_vec512(__m512i v) {
20022002
;
20032003
return _mm512_srai_epi64(v, 63);
20042004
}
@@ -2051,8 +2051,8 @@ static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u64_vec512(__m512i x, __m512i y
20512051
// y is one 64-bit value repeated.
20522052
static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s64_vec512(__m512i x, __m512i y) {
20532053
__m512i p = libdivide_mullhi_u64_vec512(x, y);
2054-
__m512i t1 = _mm512_and_si512(libdivide_s64_signbits(x), y);
2055-
__m512i t2 = _mm512_and_si512(libdivide_s64_signbits(y), x);
2054+
__m512i t1 = _mm512_and_si512(libdivide_s64_signbits_vec512(x), y);
2055+
__m512i t2 = _mm512_and_si512(libdivide_s64_signbits_vec512(y), x);
20562056
p = _mm512_sub_epi64(p, t1);
20572057
p = _mm512_sub_epi64(p, t2);
20582058
return p;
@@ -2196,7 +2196,7 @@ __m512i libdivide_s64_do_vec512(__m512i numers, const struct libdivide_s64_t *de
21962196
__m512i roundToZeroTweak = _mm512_set1_epi64(mask);
21972197
// q = numer + ((numer >> 63) & roundToZeroTweak);
21982198
__m512i q = _mm512_add_epi64(
2199-
numers, _mm512_and_si512(libdivide_s64_signbits(numers), roundToZeroTweak));
2199+
numers, _mm512_and_si512(libdivide_s64_signbits_vec512(numers), roundToZeroTweak));
22002200
q = libdivide_s64_shift_right_vec512(q, shift);
22012201
__m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
22022202
// q = (q ^ sign) - sign;
@@ -2233,7 +2233,7 @@ __m512i libdivide_s64_branchfree_do_vec512(
22332233
// If q is negative, we want to add either (2**shift)-1 if d is
22342234
// a power of 2, or (2**shift) if it is not a power of 2.
22352235
uint32_t is_power_of_2 = (magic == 0);
2236-
__m512i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63
2236+
__m512i q_sign = libdivide_s64_signbits_vec512(q); // q_sign = q >> 63
22372237
__m512i mask = _mm512_set1_epi64(((uint64_t)1 << shift) - is_power_of_2);
22382238
q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask)
22392239
q = libdivide_s64_shift_right_vec512(q, shift); // q >>= shift
@@ -2274,7 +2274,7 @@ static LIBDIVIDE_INLINE __m256i libdivide_s64_branchfree_do_vec256(
22742274
//////// Internal Utility Functions
22752275

22762276
// Implementation of _mm256_srai_epi64(v, 63) (from AVX512).
2277-
static LIBDIVIDE_INLINE __m256i libdivide_s64_signbits(__m256i v) {
2277+
static LIBDIVIDE_INLINE __m256i libdivide_s64_signbits_vec256(__m256i v) {
22782278
__m256i hiBitsDuped = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
22792279
__m256i signBits = _mm256_srai_epi32(hiBitsDuped, 31);
22802280
return signBits;
@@ -2333,8 +2333,8 @@ static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u64_vec256(__m256i x, __m256i y
23332333
// y is one 64-bit value repeated.
23342334
static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y) {
23352335
__m256i p = libdivide_mullhi_u64_vec256(x, y);
2336-
__m256i t1 = _mm256_and_si256(libdivide_s64_signbits(x), y);
2337-
__m256i t2 = _mm256_and_si256(libdivide_s64_signbits(y), x);
2336+
__m256i t1 = _mm256_and_si256(libdivide_s64_signbits_vec256(x), y);
2337+
__m256i t2 = _mm256_and_si256(libdivide_s64_signbits_vec256(y), x);
23382338
p = _mm256_sub_epi64(p, t1);
23392339
p = _mm256_sub_epi64(p, t2);
23402340
return p;
@@ -2478,7 +2478,7 @@ __m256i libdivide_s64_do_vec256(__m256i numers, const struct libdivide_s64_t *de
24782478
__m256i roundToZeroTweak = _mm256_set1_epi64x(mask);
24792479
// q = numer + ((numer >> 63) & roundToZeroTweak);
24802480
__m256i q = _mm256_add_epi64(
2481-
numers, _mm256_and_si256(libdivide_s64_signbits(numers), roundToZeroTweak));
2481+
numers, _mm256_and_si256(libdivide_s64_signbits_vec256(numers), roundToZeroTweak));
24822482
q = libdivide_s64_shift_right_vec256(q, shift);
24832483
__m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
24842484
// q = (q ^ sign) - sign;
@@ -2515,7 +2515,7 @@ __m256i libdivide_s64_branchfree_do_vec256(
25152515
// If q is negative, we want to add either (2**shift)-1 if d is
25162516
// a power of 2, or (2**shift) if it is not a power of 2.
25172517
uint32_t is_power_of_2 = (magic == 0);
2518-
__m256i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63
2518+
__m256i q_sign = libdivide_s64_signbits_vec256(q); // q_sign = q >> 63
25192519
__m256i mask = _mm256_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2);
25202520
q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask)
25212521
q = libdivide_s64_shift_right_vec256(q, shift); // q >>= shift
@@ -2556,7 +2556,7 @@ static LIBDIVIDE_INLINE __m128i libdivide_s64_branchfree_do_vec128(
25562556
//////// Internal Utility Functions
25572557

25582558
// Implementation of _mm_srai_epi64(v, 63) (from AVX512).
2559-
static LIBDIVIDE_INLINE __m128i libdivide_s64_signbits(__m128i v) {
2559+
static LIBDIVIDE_INLINE __m128i libdivide_s64_signbits_vec128(__m128i v) {
25602560
__m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
25612561
__m128i signBits = _mm_srai_epi32(hiBitsDuped, 31);
25622562
return signBits;
@@ -2629,8 +2629,8 @@ static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u64_vec128(__m128i x, __m128i y
26292629
// y is one 64-bit value repeated.
26302630
static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y) {
26312631
__m128i p = libdivide_mullhi_u64_vec128(x, y);
2632-
__m128i t1 = _mm_and_si128(libdivide_s64_signbits(x), y);
2633-
__m128i t2 = _mm_and_si128(libdivide_s64_signbits(y), x);
2632+
__m128i t1 = _mm_and_si128(libdivide_s64_signbits_vec128(x), y);
2633+
__m128i t2 = _mm_and_si128(libdivide_s64_signbits_vec128(y), x);
26342634
p = _mm_sub_epi64(p, t1);
26352635
p = _mm_sub_epi64(p, t2);
26362636
return p;
@@ -2774,7 +2774,7 @@ __m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *de
27742774
__m128i roundToZeroTweak = _mm_set1_epi64x(mask);
27752775
// q = numer + ((numer >> 63) & roundToZeroTweak);
27762776
__m128i q =
2777-
_mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak));
2777+
_mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak));
27782778
q = libdivide_s64_shift_right_vec128(q, shift);
27792779
__m128i sign = _mm_set1_epi32((int8_t)more >> 7);
27802780
// q = (q ^ sign) - sign;
@@ -2811,7 +2811,7 @@ __m128i libdivide_s64_branchfree_do_vec128(
28112811
// If q is negative, we want to add either (2**shift)-1 if d is
28122812
// a power of 2, or (2**shift) if it is not a power of 2.
28132813
uint32_t is_power_of_2 = (magic == 0);
2814-
__m128i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63
2814+
__m128i q_sign = libdivide_s64_signbits_vec128(q); // q_sign = q >> 63
28152815
__m128i mask = _mm_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2);
28162816
q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask)
28172817
q = libdivide_s64_shift_right_vec128(q, shift); // q >>= shift

test/test_c99.c

+93
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
/*
2+
* A pure C test program. The point of this is to make sure libdivide
3+
* will compile as C only.
4+
*
5+
* Since the other programs have CPP extensions, they will be compiled as C++. This
6+
* could allow C++ syntax or programming paradigms to inadvertently creep into the
7+
* code base.
8+
*/
9+
10+
#include <stdio.h>
11+
#include <inttypes.h>
12+
#include "libdivide.h"
13+
14+
#define UNUSED(x) (void)(x)
15+
#define MIN_RANGE (UINT16_MAX/4U)
16+
#define LOOP_STEP 3
17+
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
18+
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
19+
#define ABS(a) MAX(-a, a)
20+
21+
#define LOOP_START(denom) MIN(((denom*2)+LOOP_STEP), ((denom/2)+LOOP_STEP))
22+
#define LOOP_END(type, denom, range_max) MIN(MAX((type)MIN_RANGE, ABS(denom)*4), range_max-(LOOP_STEP*2))
23+
#define ASSERT_EQUAL(type, numer, denom, libdiv_result, native_result, format_spec) \
24+
if (libdiv_result!=native_result) { \
25+
fprintf(stderr, "Division fail: " #type ", %" format_spec "/%" format_spec ". Native: %" format_spec ", Libdivide %" format_spec "\n", numer, denom, native_result, libdiv_result); \
26+
}
27+
#define TEST_BODY(type, range_max, denom, divider, format_spec, OPERATION) \
28+
/* We need to be careful to have a wide enough range AND increment!=1 or else GCC figures out */ \
29+
/* this is a constant range and applies all sorts of optimizations */ \
30+
type loop = (type)LOOP_START(denom); \
31+
const type end = (type)LOOP_END(type, denom, range_max); \
32+
const type step = MAX(LOOP_STEP, (end-loop)/(2<<12)); \
33+
printf("Testing " #type ", %" format_spec " from %" format_spec " to %" format_spec ", step %" format_spec "\n", denom, loop, end, step); \
34+
for (; loop < end; loop+=step) \
35+
{ \
36+
type libdiv_result = OPERATION(loop); \
37+
type native_result = loop / denom; \
38+
ASSERT_EQUAL(type, loop, denom, libdiv_result, native_result, format_spec) \
39+
}
40+
41+
void test_u16() {
42+
uint16_t denom = (uint16_t)953; // Prime
43+
struct libdivide_u16_t divider = libdivide_u16_gen(denom);
44+
#define OP_U16(loop) libdivide_u16_do(loop, &divider)
45+
TEST_BODY(uint16_t, UINT16_MAX, denom, divider, PRIu16, OP_U16)
46+
}
47+
48+
void test_s16() {
49+
int16_t denom = (int16_t)-4003; // Prime
50+
struct libdivide_s16_t divider = libdivide_s16_gen(denom);
51+
#define OP_S16(loop) libdivide_s16_do(loop, &divider)
52+
TEST_BODY(int16_t, INT16_MAX, denom, divider, PRId16, OP_S16)
53+
}
54+
55+
void test_u32() {
56+
uint32_t denom = ((uint32_t)2 << 21) - 19; // Prime - see https://primes.utm.edu/lists/2small/0bit.html
57+
struct libdivide_u32_t divider = libdivide_u32_gen(denom);
58+
#define OP_U32(loop) libdivide_u32_do(loop, &divider)
59+
TEST_BODY(uint32_t, UINT32_MAX, denom, divider, PRIu32, OP_U32)
60+
}
61+
62+
void test_s32() {
63+
int32_t denom = -(((int32_t)2 << 21) - 55); // Prime - see https://primes.utm.edu/lists/2small/0bit.html
64+
struct libdivide_s32_t divider = libdivide_s32_gen(denom);
65+
#define OP_S32(loop) libdivide_s32_do(loop, &divider)
66+
TEST_BODY(int32_t, INT32_MAX, denom, divider, PRId32, OP_S32)
67+
}
68+
69+
void test_u64() {
70+
uint64_t denom = ((uint64_t)2 << 29) - 43; // Prime - see https://primes.utm.edu/lists/2small/0bit.html
71+
struct libdivide_u64_t divider = libdivide_u64_gen(denom);
72+
#define OP_U64(loop) libdivide_u64_do(loop, &divider)
73+
TEST_BODY(uint64_t, (UINT64_MAX/2) /* For speed */, denom, divider, PRIu64, OP_U64)
74+
}
75+
76+
void test_s64() {
77+
int64_t denom = -(((int64_t)2 << 29) - 121); // Prime - see https://primes.utm.edu/lists/2small/0bit.html
78+
struct libdivide_s64_t divider = libdivide_s64_gen(denom);
79+
#define OP_S64(loop) libdivide_s64_do(loop, &divider)
80+
TEST_BODY(int64_t, INT64_MAX, denom, divider, PRId64, OP_S64)
81+
}
82+
83+
int main (int argc, char *argv[]) {
84+
UNUSED(argc);
85+
UNUSED(argv);
86+
87+
test_u16();
88+
test_s16();
89+
test_u32();
90+
test_s32();
91+
test_u64();
92+
test_s64();
93+
}

0 commit comments

Comments
 (0)