@@ -39,25 +39,26 @@
 
 namespace Stockfish::Eval::NNUE::Layers {
 
+#if defined(USE_SSSE3) || defined(USE_NEON_DOTPROD)
+    #define ENABLE_SEQ_OPT
+#endif
+
 // Fallback implementation for older/other architectures.
 // Requires the input to be padded to at least 16 values.
-#if !defined(USE_SSSE3)
+#ifndef ENABLE_SEQ_OPT
+
 template<IndexType InputDimensions, IndexType PaddedInputDimensions, IndexType OutputDimensions>
 static void affine_transform_non_ssse3(std::int32_t*       output,
                                        const std::int8_t*  weights,
                                        const std::int32_t* biases,
                                        const std::uint8_t* input) {
-#if defined(USE_SSE2) || defined(USE_NEON_DOTPROD) || defined(USE_NEON)
+#if defined(USE_SSE2) || defined(USE_NEON)
 #if defined(USE_SSE2)
     // At least a multiple of 16, with SSE2.
     constexpr IndexType NumChunks   = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
     const __m128i       Zeros       = _mm_setzero_si128();
     const auto          inputVector = reinterpret_cast<const __m128i*>(input);
 
-#elif defined(USE_NEON_DOTPROD)
-    constexpr IndexType NumChunks   = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
-    const auto          inputVector = reinterpret_cast<const int8x16_t*>(input);
-
 #elif defined(USE_NEON)
     constexpr IndexType NumChunks   = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
     const auto          inputVector = reinterpret_cast<const int8x8_t*>(input);
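Note on the hunk above: NEON_DOTPROD targets leave the fallback because they now define ENABLE_SEQ_OPT and take the optimized propagate() path instead. For orientation, a minimal sketch of what the removed branch computed with the ARMv8.2-A dot-product intrinsics (the function name is illustrative, not part of the patch):

```cpp
#include <arm_neon.h>
#include <cstdint>

// Illustrative only: vdotq_s32 accumulates four int8*int8 products into each
// of the four int32 lanes; vaddvq_s32 then folds the lanes into one scalar.
std::int32_t dot16_sketch(int32x4_t acc, int8x16_t a, int8x16_t b) {
    acc = vdotq_s32(acc, a, b);
    return vaddvq_s32(acc);
}
```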
@@ -91,16 +92,8 @@ static void affine_transform_non_ssse3(std::int32_t* output,
         sum = _mm_add_epi32(sum, sum_second_32);
         output[i] = _mm_cvtsi128_si32(sum);
 
-#elif defined(USE_NEON_DOTPROD)
-        int32x4_t  sum = {biases[i]};
-        const auto row = reinterpret_cast<const int8x16_t*>(&weights[offset]);
-        for (IndexType j = 0; j < NumChunks; ++j)
-        {
-            sum = vdotq_s32(sum, inputVector[j], row[j]);
-        }
-        output[i] = vaddvq_s32(sum);
-
 #elif defined(USE_NEON)
+
         int32x4_t  sum = {biases[i]};
         const auto row = reinterpret_cast<const int8x8_t*>(&weights[offset]);
         for (IndexType j = 0; j < NumChunks; ++j)
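The surviving plain-NEON branch works on 8-byte vectors without the dot-product extension. Its loop body falls outside this hunk; a common way to accumulate int8 products on such targets is a widening multiply followed by a pairwise add-accumulate (a sketch under that assumption, not necessarily the elided body):

```cpp
#include <arm_neon.h>

// Sketch: widen 8 int8*int8 products to int16, then pairwise-accumulate
// them into four int32 lanes. Names are illustrative.
int32x4_t mac_sketch(int32x4_t sum, int8x8_t in, int8x8_t w) {
    int16x8_t product = vmull_s8(in, w);
    return vpadalq_s16(sum, product);
}
```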
@@ -127,7 +120,8 @@ static void affine_transform_non_ssse3(std::int32_t* output,
     }
 #endif
 }
-#endif
+
+#endif  // !ENABLE_SEQ_OPT
 
 template<IndexType InDims, IndexType OutDims>
 class AffineTransform {
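Condensed, the gate this patch introduces selects between the two implementations like this (a summary of the hunks above, with comments noting which builds land where):

```cpp
// SSSE3-capable x86 builds and NEON builds with the dotprod extension get
// the sequentially-optimized SIMD path; SSE2, plain NEON, and scalar builds
// keep the affine_transform_non_ssse3() fallback.
#if defined(USE_SSSE3) || defined(USE_NEON_DOTPROD)
    #define ENABLE_SEQ_OPT
#endif

#ifndef ENABLE_SEQ_OPT
// ... affine_transform_non_ssse3() is compiled only here ...
#endif
```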
@@ -162,7 +156,7 @@ class AffineTransform {
     }
 
     static constexpr IndexType get_weight_index(IndexType i) {
-#if defined(USE_SSSE3)
+#ifdef ENABLE_SEQ_OPT
         return get_weight_index_scrambled(i);
 #else
         return i;
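get_weight_index() now scrambles weights for every ENABLE_SEQ_OPT target, not just SSSE3, because the dotprod path consumes the same 4-byte-grouped layout. A sketch of that kind of permutation (the exact formula is get_weight_index_scrambled(), defined elsewhere in this class; this only shows the shape of the idea):

```cpp
#include <cstdint>

// Illustrative: store the 4 int8 weights that one 32-bit SIMD lane reads
// contiguously, interleaved across output rows.
template<std::uint32_t PaddedInputDimensions, std::uint32_t OutputDimensions>
constexpr std::uint32_t scrambled_index_sketch(std::uint32_t i) {
    return (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4
         + i / PaddedInputDimensions * 4 + i % 4;
}
```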
@@ -190,29 +184,28 @@ class AffineTransform {
     // Forward propagation
     void propagate(const InputType* input, OutputType* output) const {
 
-#if defined(USE_SSSE3)
+#ifdef ENABLE_SEQ_OPT
 
         if constexpr (OutputDimensions > 1)
         {
-
 #if defined(USE_AVX512)
             using vec_t = __m512i;
-    #define vec_setzero _mm512_setzero_si512
     #define vec_set_32 _mm512_set1_epi32
     #define vec_add_dpbusd_32 Simd::m512_add_dpbusd_epi32
-    #define vec_hadd Simd::m512_hadd
 #elif defined(USE_AVX2)
             using vec_t = __m256i;
-    #define vec_setzero _mm256_setzero_si256
     #define vec_set_32 _mm256_set1_epi32
     #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32
-    #define vec_hadd Simd::m256_hadd
 #elif defined(USE_SSSE3)
             using vec_t = __m128i;
-    #define vec_setzero _mm_setzero_si128
     #define vec_set_32 _mm_set1_epi32
     #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32
-    #define vec_hadd Simd::m128_hadd
+#elif defined(USE_NEON_DOTPROD)
+            using vec_t = int32x4_t;
+    #define vec_set_32 vdupq_n_s32
+    #define vec_add_dpbusd_32(acc, a, b) \
+        Simd::dotprod_m128_add_dpbusd_epi32(acc, vreinterpretq_s8_s32(a), \
+                                            vreinterpretq_s8_s32(b))
 #endif
 
             static constexpr IndexType OutputSimdWidth = sizeof(vec_t) / sizeof(OutputType);
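In the hunk above, the NEON_DOTPROD case reuses the generic kernel by mapping vec_add_dpbusd_32 onto a dot-product helper; since vec_t is int32x4_t there, the operands are reinterpreted to int8x16_t at the call site, and the unused vec_setzero/vec_hadd macros are dropped from the multi-output path. What Simd::dotprod_m128_add_dpbusd_epi32 is assumed to do (its real definition lives in simd.h; the signature here is an assumption):

```cpp
#include <arm_neon.h>

// Assumed shape of the helper: one dot-product accumulate of 16 int8
// pairs, mirroring the x86 dpbusd accumulate it is named after.
inline void dotprod_m128_add_dpbusd_epi32_sketch(int32x4_t& acc,
                                                 int8x16_t  a,
                                                 int8x16_t  b) {
    acc = vdotq_s32(acc, a, b);
}
```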
@@ -242,28 +235,33 @@ class AffineTransform {
             for (IndexType k = 0; k < NumRegs; ++k)
                 outptr[k] = acc[k];
 
-    #undef vec_setzero
     #undef vec_set_32
     #undef vec_add_dpbusd_32
-    #undef vec_hadd
         }
         else if constexpr (OutputDimensions == 1)
         {
-
     // We cannot use AVX512 for the last layer because there are only 32 inputs
     // and the buffer is not padded to 64 elements.
 #if defined(USE_AVX2)
             using vec_t = __m256i;
-    #define vec_setzero _mm256_setzero_si256
+    #define vec_setzero() _mm256_setzero_si256()
     #define vec_set_32 _mm256_set1_epi32
     #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32
     #define vec_hadd Simd::m256_hadd
 #elif defined(USE_SSSE3)
             using vec_t = __m128i;
-    #define vec_setzero _mm_setzero_si128
+    #define vec_setzero() _mm_setzero_si128()
     #define vec_set_32 _mm_set1_epi32
     #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32
     #define vec_hadd Simd::m128_hadd
+#elif defined(USE_NEON_DOTPROD)
+            using vec_t = int32x4_t;
+    #define vec_setzero() vdupq_n_s32(0)
+    #define vec_set_32 vdupq_n_s32
+    #define vec_add_dpbusd_32(acc, a, b) \
+        Simd::dotprod_m128_add_dpbusd_epi32(acc, vreinterpretq_s8_s32(a), \
+                                            vreinterpretq_s8_s32(b))
+    #define vec_hadd Simd::neon_m128_hadd
 #endif
 
             const auto inputVector = reinterpret_cast<const vec_t*>(input);
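For the single-output path, vec_setzero becomes a function-like macro so the NEON variant can expand to a call (vdupq_n_s32(0)), and vec_hadd must collapse the accumulator to a scalar plus the bias. Simd::neon_m128_hadd is assumed to reduce the four lanes with vaddvq_s32 (a sketch; the real helper is in simd.h):

```cpp
#include <arm_neon.h>
#include <cstdint>

// Assumed reduction matching the vec_hadd contract: lane sum plus bias.
inline std::int32_t neon_m128_hadd_sketch(int32x4_t sum, std::int32_t bias) {
    return vaddvq_s32(sum) + bias;
}
```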