@@ -362,7 +362,7 @@ static LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) {
 #elif defined(LIBDIVIDE_VC)
     unsigned long result;
     if (_BitScanReverse(&result, (unsigned long)val)) {
-        return static_cast<int16_t>(15 - result);
+        return (int16_t)(15 - result);
     }
     return 0;
 #else
@@ -676,7 +676,7 @@ static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen(
     }

     struct libdivide_u16_t result;
-    uint8_t floor_log_2_d = static_cast<uint8_t>(15 - libdivide_count_leading_zeros16(d));
+    uint8_t floor_log_2_d = (uint8_t)(15 - libdivide_count_leading_zeros16(d));

     // Power of 2
     if ((d & (d - 1)) == 0) {
@@ -1183,7 +1183,7 @@ static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_internal_s16_gen(
     if ((absD & (absD - 1)) == 0) {
         // Branchfree and normal paths are exactly the same
         result.magic = 0;
-        result.more = static_cast<uint8_t>(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
+        result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
     } else {
         LIBDIVIDE_ASSERT(floor_log_2_d >= 1);

@@ -1198,15 +1198,15 @@ static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_internal_s16_gen(
         // This works if e < 2**floor_log_2_d.
         if (!branchfree && e < ((uint16_t)1 << floor_log_2_d)) {
             // This power works
-            more = static_cast<uint8_t>(floor_log_2_d - 1);
+            more = (uint8_t)(floor_log_2_d - 1);
         } else {
             // We need to go one higher. This should not make proposed_m
             // overflow, but it will make it negative when interpreted as an
             // int16_t.
             proposed_m += proposed_m;
             const uint16_t twice_rem = rem + rem;
             if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
-            more = static_cast<uint8_t>(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
+            more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
         }

         proposed_m += 1;
@@ -1703,23 +1703,23 @@ static LIBDIVIDE_INLINE int64x2_t libdivide_s64_branchfree_do_vec128(
 // Logical right shift by runtime value.
 // NEON implements right shift as left shift by negative values.
 static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_neon_srl(uint32x4_t v, uint8_t amt) {
-    int32_t wamt = static_cast<int32_t>(amt);
+    int32_t wamt = (int32_t)(amt);
     return vshlq_u32(v, vdupq_n_s32(-wamt));
 }

 static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_neon_srl(uint64x2_t v, uint8_t amt) {
-    int64_t wamt = static_cast<int64_t>(amt);
+    int64_t wamt = (int64_t)(amt);
     return vshlq_u64(v, vdupq_n_s64(-wamt));
 }

 // Arithmetic right shift by runtime value.
 static LIBDIVIDE_INLINE int32x4_t libdivide_s32_neon_sra(int32x4_t v, uint8_t amt) {
-    int32_t wamt = static_cast<int32_t>(amt);
+    int32_t wamt = (int32_t)(amt);
     return vshlq_s32(v, vdupq_n_s32(-wamt));
 }

 static LIBDIVIDE_INLINE int64x2_t libdivide_s64_neon_sra(int64x2_t v, uint8_t amt) {
-    int64_t wamt = static_cast<int64_t>(amt);
+    int64_t wamt = (int64_t)(amt);
     return vshlq_s64(v, vdupq_n_s64(-wamt));
 }

@@ -1771,7 +1771,7 @@ static LIBDIVIDE_INLINE uint64x2_t libdivide_mullhi_u64_vec128(uint64x2_t x, uin

 static LIBDIVIDE_INLINE int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64_t sy) {
     int64x2_t p = vreinterpretq_s64_u64(
-        libdivide_mullhi_u64_vec128(vreinterpretq_u64_s64(x), static_cast<uint64_t>(sy)));
+        libdivide_mullhi_u64_vec128(vreinterpretq_u64_s64(x), (uint64_t)(sy)));
     int64x2_t y = vdupq_n_s64(sy);
     int64x2_t t1 = vandq_s64(libdivide_s64_signbits(x), y);
     int64x2_t t2 = vandq_s64(libdivide_s64_signbits(y), x);
@@ -1998,7 +1998,7 @@ static LIBDIVIDE_INLINE __m512i libdivide_s64_branchfree_do_vec512(

 //////// Internal Utility Functions

-static LIBDIVIDE_INLINE __m512i libdivide_s64_signbits(__m512i v) {
+static LIBDIVIDE_INLINE __m512i libdivide_s64_signbits_vec512(__m512i v) {
     ;
     return _mm512_srai_epi64(v, 63);
 }
@@ -2051,8 +2051,8 @@ static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u64_vec512(__m512i x, __m512i y
 // y is one 64-bit value repeated.
 static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s64_vec512(__m512i x, __m512i y) {
     __m512i p = libdivide_mullhi_u64_vec512(x, y);
-    __m512i t1 = _mm512_and_si512(libdivide_s64_signbits(x), y);
-    __m512i t2 = _mm512_and_si512(libdivide_s64_signbits(y), x);
+    __m512i t1 = _mm512_and_si512(libdivide_s64_signbits_vec512(x), y);
+    __m512i t2 = _mm512_and_si512(libdivide_s64_signbits_vec512(y), x);
     p = _mm512_sub_epi64(p, t1);
     p = _mm512_sub_epi64(p, t2);
     return p;
@@ -2196,7 +2196,7 @@ __m512i libdivide_s64_do_vec512(__m512i numers, const struct libdivide_s64_t *de
         __m512i roundToZeroTweak = _mm512_set1_epi64(mask);
         // q = numer + ((numer >> 63) & roundToZeroTweak);
         __m512i q = _mm512_add_epi64(
-            numers, _mm512_and_si512(libdivide_s64_signbits(numers), roundToZeroTweak));
+            numers, _mm512_and_si512(libdivide_s64_signbits_vec512(numers), roundToZeroTweak));
         q = libdivide_s64_shift_right_vec512(q, shift);
         __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
         // q = (q ^ sign) - sign;
@@ -2233,7 +2233,7 @@ __m512i libdivide_s64_branchfree_do_vec512(
     // If q is negative, we want to add either (2**shift)-1 if d is
     // a power of 2, or (2**shift) if it is not a power of 2.
     uint32_t is_power_of_2 = (magic == 0);
-    __m512i q_sign = libdivide_s64_signbits(q);  // q_sign = q >> 63
+    __m512i q_sign = libdivide_s64_signbits_vec512(q);  // q_sign = q >> 63
     __m512i mask = _mm512_set1_epi64(((uint64_t)1 << shift) - is_power_of_2);
     q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask));  // q = q + (q_sign & mask)
     q = libdivide_s64_shift_right_vec512(q, shift);  // q >>= shift
@@ -2274,7 +2274,7 @@ static LIBDIVIDE_INLINE __m256i libdivide_s64_branchfree_do_vec256(
 //////// Internal Utility Functions

 // Implementation of _mm256_srai_epi64(v, 63) (from AVX512).
-static LIBDIVIDE_INLINE __m256i libdivide_s64_signbits(__m256i v) {
+static LIBDIVIDE_INLINE __m256i libdivide_s64_signbits_vec256(__m256i v) {
     __m256i hiBitsDuped = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
     __m256i signBits = _mm256_srai_epi32(hiBitsDuped, 31);
     return signBits;
@@ -2333,8 +2333,8 @@ static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u64_vec256(__m256i x, __m256i y
 // y is one 64-bit value repeated.
 static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y) {
     __m256i p = libdivide_mullhi_u64_vec256(x, y);
-    __m256i t1 = _mm256_and_si256(libdivide_s64_signbits(x), y);
-    __m256i t2 = _mm256_and_si256(libdivide_s64_signbits(y), x);
+    __m256i t1 = _mm256_and_si256(libdivide_s64_signbits_vec256(x), y);
+    __m256i t2 = _mm256_and_si256(libdivide_s64_signbits_vec256(y), x);
     p = _mm256_sub_epi64(p, t1);
     p = _mm256_sub_epi64(p, t2);
     return p;
@@ -2478,7 +2478,7 @@ __m256i libdivide_s64_do_vec256(__m256i numers, const struct libdivide_s64_t *de
         __m256i roundToZeroTweak = _mm256_set1_epi64x(mask);
         // q = numer + ((numer >> 63) & roundToZeroTweak);
         __m256i q = _mm256_add_epi64(
-            numers, _mm256_and_si256(libdivide_s64_signbits(numers), roundToZeroTweak));
+            numers, _mm256_and_si256(libdivide_s64_signbits_vec256(numers), roundToZeroTweak));
         q = libdivide_s64_shift_right_vec256(q, shift);
         __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
         // q = (q ^ sign) - sign;
@@ -2515,7 +2515,7 @@ __m256i libdivide_s64_branchfree_do_vec256(
     // If q is negative, we want to add either (2**shift)-1 if d is
     // a power of 2, or (2**shift) if it is not a power of 2.
    uint32_t is_power_of_2 = (magic == 0);
-    __m256i q_sign = libdivide_s64_signbits(q);  // q_sign = q >> 63
+    __m256i q_sign = libdivide_s64_signbits_vec256(q);  // q_sign = q >> 63
     __m256i mask = _mm256_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2);
     q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask));  // q = q + (q_sign & mask)
     q = libdivide_s64_shift_right_vec256(q, shift);  // q >>= shift
@@ -2556,7 +2556,7 @@ static LIBDIVIDE_INLINE __m128i libdivide_s64_branchfree_do_vec128(
 //////// Internal Utility Functions

 // Implementation of _mm_srai_epi64(v, 63) (from AVX512).
-static LIBDIVIDE_INLINE __m128i libdivide_s64_signbits(__m128i v) {
+static LIBDIVIDE_INLINE __m128i libdivide_s64_signbits_vec128(__m128i v) {
     __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
     __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31);
     return signBits;
@@ -2629,8 +2629,8 @@ static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u64_vec128(__m128i x, __m128i y
 // y is one 64-bit value repeated.
 static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y) {
     __m128i p = libdivide_mullhi_u64_vec128(x, y);
-    __m128i t1 = _mm_and_si128(libdivide_s64_signbits(x), y);
-    __m128i t2 = _mm_and_si128(libdivide_s64_signbits(y), x);
+    __m128i t1 = _mm_and_si128(libdivide_s64_signbits_vec128(x), y);
+    __m128i t2 = _mm_and_si128(libdivide_s64_signbits_vec128(y), x);
     p = _mm_sub_epi64(p, t1);
     p = _mm_sub_epi64(p, t2);
     return p;
@@ -2774,7 +2774,7 @@ __m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *de
         __m128i roundToZeroTweak = _mm_set1_epi64x(mask);
         // q = numer + ((numer >> 63) & roundToZeroTweak);
         __m128i q =
-            _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak));
+            _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak));
         q = libdivide_s64_shift_right_vec128(q, shift);
         __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
         // q = (q ^ sign) - sign;
@@ -2811,7 +2811,7 @@ __m128i libdivide_s64_branchfree_do_vec128(
     // If q is negative, we want to add either (2**shift)-1 if d is
     // a power of 2, or (2**shift) if it is not a power of 2.
     uint32_t is_power_of_2 = (magic == 0);
-    __m128i q_sign = libdivide_s64_signbits(q);  // q_sign = q >> 63
+    __m128i q_sign = libdivide_s64_signbits_vec128(q);  // q_sign = q >> 63
     __m128i mask = _mm_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2);
     q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask));  // q = q + (q_sign & mask)
     q = libdivide_s64_shift_right_vec128(q, shift);  // q >>= shift