Skip to content

Commit c930c4d

Browse files
authored
Add first NEON SIMD opcode implementations to fast interpreter (#3859)
Add some implementations of SIMD opcodes using NEON instructions. Tested using: ```wast (module (import "wasi_snapshot_preview1" "proc_exit" (func $proc_exit (param i32))) (memory (export "memory") 1) (func $assert_true (param v128) local.get 0 v128.any_true i32.eqz if unreachable end ) (func $main (export "_start") i32.const 0 i32.const 32 memory.grow drop i32.const 0 v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 v128.store i32.const 0 v128.load v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 i8x16.eq call $assert_true i32.const 16 v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 v128.store i32.const 16 v128.load v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 i8x16.eq call $assert_true i32.const 0 v128.load v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 i8x16.eq call $assert_true drop i32.const 0 i32.const 1 memory.grow drop i32.const 0 i64.const 0x7F80FF017E02FE80 i64.store i32.const 0 v128.load8x8_s v128.const i16x8 127 -128 -1 1 126 2 -2 -128 i16x8.eq call $assert_true i32.const 0 i64.const 0x80FE027E01FF807F i64.store i32.const 0 v128.load8x8_u v128.const i16x8 128 254 2 126 1 255 128 127 i16x8.eq call $assert_true i32.const 0 i64.const 0x8000FFFE7FFF0001 i64.store i32.const 0 v128.load16x4_s v128.const i32x4 -32768 -2 32767 1 i32x4.eq call $assert_true i32.const 0 i64.const 0x8000FFFE7FFF0001 i64.store i32.const 0 v128.load16x4_u v128.const i32x4 32768 65534 32767 1 i32x4.eq call $assert_true i32.const 0 i64.const 0x8000000000000001 i64.store i32.const 0 v128.load32x2_s v128.const i64x2 -2147483648 1 i64x2.eq call $assert_true i32.const 0 i64.const 0x8000000000000001 i64.store i32.const 0 v128.load32x2_u v128.const i64x2 2147483648 1 i64x2.eq call $assert_true call $proc_exit ) ) ```
1 parent aceaed6 commit c930c4d

File tree

8 files changed

+1128
-47
lines changed

8 files changed

+1128
-47
lines changed

build-scripts/config_common.cmake

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,9 @@ endif ()
290290
if (WAMR_BUILD_LIB_RATS EQUAL 1)
291291
message (" Lib rats enabled")
292292
endif()
293+
# Print a notice when the SIMDe library is enabled (WAMR_BUILD_LIB_SIMDE is 1)
if (WAMR_BUILD_LIB_SIMDE EQUAL 1)
  message (" Lib simde enabled")
endif()
293296
if (WAMR_BUILD_MINI_LOADER EQUAL 1)
294297
add_definitions (-DWASM_ENABLE_MINI_LOADER=1)
295298
message (" WASM mini loader enabled")

build-scripts/runtime_lib.cmake

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,10 @@ if (WAMR_BUILD_LIB_RATS EQUAL 1)
155155
include (${IWASM_DIR}/libraries/lib-rats/lib_rats.cmake)
156156
endif ()
157157

158+
# Include the SIMDe library's CMake script when WAMR_BUILD_LIB_SIMDE is 1
if (WAMR_BUILD_LIB_SIMDE EQUAL 1)
159+
include (${IWASM_DIR}/libraries/simde/simde.cmake)
160+
endif ()
161+
158162
if (WAMR_BUILD_WASM_CACHE EQUAL 1)
159163
include (${WAMR_ROOT_DIR}/build-scripts/involve_boringssl.cmake)
160164
endif ()

core/config.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,12 @@
318318
#define WASM_ENABLE_SIMD 0
319319
#endif
320320

321+
/* SIMDe (used in the fast interpreter for SIMD opcodes) is disabled
322+
unless WASM_ENABLE_SIMDE is already defined elsewhere */
323+
#ifndef WASM_ENABLE_SIMDE
324+
#define WASM_ENABLE_SIMDE 0
325+
#endif
326+
321327
/* GC performance profiling */
322328
#ifndef WASM_ENABLE_GC_PERF_PROFILING
323329
#define WASM_ENABLE_GC_PERF_PROFILING 0

core/iwasm/common/wasm_runtime_common.h

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,13 +73,20 @@ STORE_U8(void *addr, uint8_t value)
7373
*(uint8 *)addr = value;
7474
}
7575

76+
static inline void
77+
STORE_V128(void *addr, V128 value)
78+
{
79+
*(V128 *)addr = value;
80+
}
81+
7682
/* For LOAD opcodes */
7783
#define LOAD_I64(addr) (*(int64 *)(addr))
7884
#define LOAD_F64(addr) (*(float64 *)(addr))
7985
#define LOAD_I32(addr) (*(int32 *)(addr))
8086
#define LOAD_U32(addr) (*(uint32 *)(addr))
8187
#define LOAD_I16(addr) (*(int16 *)(addr))
8288
#define LOAD_U16(addr) (*(uint16 *)(addr))
89+
/* Read a V128 value directly through a cast pointer (no alignment handling) */
#define LOAD_V128(addr) (*(V128 *)(addr))
8390

8491
#define STORE_PTR(addr, ptr) \
8592
do { \
@@ -264,7 +271,67 @@ STORE_U16(void *addr, uint16_t value)
264271
((uint8_t *)(addr))[0] = u.u8[0];
265272
((uint8_t *)(addr))[1] = u.u8[1];
266273
}
274+
275+
static inline void
276+
STORE_V128(void *addr, V128 value)
277+
{
278+
uintptr_t addr_ = (uintptr_t)(addr);
279+
union {
280+
V128 val;
281+
uint64 u64[2];
282+
uint32 u32[4];
283+
uint16 u16[8];
284+
uint8 u8[16];
285+
} u;
286+
287+
if ((addr_ & (uintptr_t)15) == 0) {
288+
*(V128 *)addr = value;
289+
}
290+
else {
291+
u.val = value;
292+
if ((addr_ & (uintptr_t)7) == 0) {
293+
((uint64 *)(addr))[0] = u.u64[0];
294+
((uint64 *)(addr))[1] = u.u64[1];
295+
}
296+
else {
297+
bh_assert((addr_ & (uintptr_t)3) == 0);
298+
((uint32 *)addr)[0] = u.u32[0];
299+
((uint32 *)addr)[1] = u.u32[1];
300+
((uint32 *)addr)[2] = u.u32[2];
301+
((uint32 *)addr)[3] = u.u32[3];
302+
}
303+
}
304+
}
305+
267306
/* For LOAD opcodes */
307+
static inline V128
308+
LOAD_V128(void *addr)
309+
{
310+
uintptr_t addr1 = (uintptr_t)addr;
311+
union {
312+
V128 val;
313+
uint64 u64[2];
314+
uint32 u32[4];
315+
uint16 u16[8];
316+
uint8 u8[16];
317+
} u;
318+
if ((addr1 & (uintptr_t)15) == 0)
319+
return *(V128 *)addr;
320+
321+
if ((addr1 & (uintptr_t)7) == 0) {
322+
u.u64[0] = ((uint64 *)addr)[0];
323+
u.u64[1] = ((uint64 *)addr)[1];
324+
}
325+
else {
326+
bh_assert((addr1 & (uintptr_t)3) == 0);
327+
u.u32[0] = ((uint32 *)addr)[0];
328+
u.u32[1] = ((uint32 *)addr)[1];
329+
u.u32[2] = ((uint32 *)addr)[2];
330+
u.u32[3] = ((uint32 *)addr)[3];
331+
}
332+
return u.val;
333+
}
334+
268335
static inline int64
269336
LOAD_I64(void *addr)
270337
{

0 commit comments

Comments
 (0)