diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s index 024a994020..a30816cee9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s index 56e93fbbef..6a1f817ff1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s index d86f73aa09..b2f05567a5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s index 49b5bb26a8..1bcaa28e98 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s index 1724b70b33..e5b0db7445 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s index a1ce66bb99..dea6628e6c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s index 143a7b9da2..b0d9b31d0f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -364,7 +364,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_ ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s index a2c8049020..b018428c88 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -364,7 +364,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s index 2542fe2ca3..4b8e6cbf94 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -396,7 +396,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_ ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s index 1cafc13d81..4f4e109f62 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s index cfb95f7a4a..4fbe872341 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s index 0d285d546e..bf82d7d4ab 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s index 43df9bdf44..76a640ae05 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s index 5ded5f750c..ad77b45fd8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s index 6294340e55..996c32b1c2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s index 5f217edc5f..6d4089f781 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -363,7 +363,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s index ddc6cfe03f..eee7b71191 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s index ea138a64c2..4697d59cf2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -363,7 +363,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s index 8b5e169c7f..46d435d907 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s index cf7f3ca433..89d84dde54 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -395,7 +395,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s index 77bda69295..7c24f46443 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s index babdda683b..67d4c846e8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -347,7 +347,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s index 1542ee2116..75eda4891c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s index 6fd213bc6b..80f9957d24 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s index 7d3938ab65..f14e1d46a5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s index 801750881b..99aa33ba04 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s index 335805d084..d2a91b6c34 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s index 0f46806478..48741569e6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s index 185e97ae31..9a3b868f0e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -364,7 +364,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s index ea496ad966..b80b691de8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s index 3a936eabc3..a0f5700af2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s index 08e48c8eb7..1d8185db34 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -362,7 +362,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s index d5bd2a35d2..96cf2afc2c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -347,7 +347,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s index 2256007c93..174fb3925c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -344,7 +344,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s index 5d39e2af2a..cd15ed12d4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s index f4488af505..07593e327d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -336,7 +336,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s index d717603f73..607dd1f5ce 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -360,7 +360,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s index dff9926e37..fe5e4649d4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -363,7 +363,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s index cde628a42f..3f3bfa08c4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s index 675c5dec92..f34bbc21eb 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s index 6b80515a75..b92ac8a73b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -336,7 +336,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s index 109477b2bc..967f978509 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s index bb9950149b..f67e485a01 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s index df7f278576..022e72efe8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s index 1352756cbc..f56b39d6bd 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s index 6de27e894b..20ebe6ec7a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -347,7 +347,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s index 9820f35b0a..c22f9b4b75 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s index f1507d992b..58601a1997 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s index 4d23937b40..00a8baa6c2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s index 0891f4a473..efa733f7fd 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s index 61be43069a..9f6ab10002 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s index 8695a35634..0ebb58022d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s index d6b0fd92cf..2c7aa919d3 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -364,7 +364,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s index aa52abb160..3f9e12dc55 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s index 922e504dc8..dd2674ff55 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s index abe8d5e546..5858526097 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s index e6b5cdb9f2..2796794037 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s index 65a1d6c5bb..6c0df8b80c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s index 7dc8ba72b3..54f7c611b0 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s index 8d44165ded..48c83560d8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s index 920c94be36..31bb728d41 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s index e8f9507e04..275237898d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s index b5b21d4dd4..a8f759dba1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s index 66c1444bb1..092ab54677 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -349,7 +349,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s index b51dfa3a54..d027fe337d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -347,7 +347,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s index 8fb8f1cffe..d04f882901 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -349,7 +349,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s index 54a319e660..5f3f3ae780 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -347,7 +347,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index b7025a6098..90fc2521c6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index eb5dfdc2ca..aad7c1d698 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index ae0c5d04d4..aa5f8b9d2b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index 877538b7bc..a4caf1cde5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s index 400641b16b..e63d1649b4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s index df8f380449..242dafef1f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s index 2446f830db..a322a9f7f4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -333,7 +333,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s index 8ecc453b9c..b40835db6e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s index f3195406b1..2b90a4833b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -362,7 +362,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s index ebbb90e28d..acb40502de 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -365,7 +365,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s index e0b26d24d5..b70d566ba4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -347,7 +347,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s index ab2306b39e..a3e10e10c5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -344,7 +344,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s index 852d49ee2e..74f5d833ac 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -338,7 +338,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s index e66c93eda9..71e5c11721 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -336,7 +336,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s index 51f85b45be..0bd113e89d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s index 311cad82b1..77229e5b9e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s index 0d59ba3430..9e02d3216d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -331,7 +331,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s index d0aab48eb2..0ae2c66ba2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -341,7 +341,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s index 3b454dd120..7b60c8b5a4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s index 62b1d5193f..3a866a3cdb 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -336,7 +336,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s index dfb3bd4c97..918694d571 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s index 196d648782..f319731422 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -360,7 +360,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index 3e4de4a354..795dd72c94 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index 6b2886b518..3e4ecc9e7a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index a7e5b4a771..3008d5ce3e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index b23d753ef9..5d93347269 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index bf06324cd2..c86a88f870 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index e5d2dbe7bc..2b99482e5f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index c455e78625..5bdcecead8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index 3c4786e042..4083d4975b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s index 0d4135cbd5..2e8a6f3eb8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -364,7 +364,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index 3c43c9332b..9177bd7e2f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -341,7 +341,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index 0561d8f8e2..035c44bd71 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index 2581cb42e8..94fb2dbf7f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -341,7 +341,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index 9f390cf578..d8a6529ada 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index 1478cf0500..bf017680f4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -341,7 +341,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index d81edeef25..afbc0c21b2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s index a8226bdbe1..4efb7a99a1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s index 560cc3716a..9ec63c4516 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s index 4c34e6bb7d..79a4e5a032 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s index 7b02c38069..ce68f2982a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -363,7 +363,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s index d47aa7a8b9..178e72e1e2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s index b63a049184..4c8747680e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -363,7 +363,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s index c9718e29a4..bf4fc47751 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s index 768bfd9f3e..57dd1de115 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -363,7 +363,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s index 4f66ad890f..56a2e1ffb8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s index 881a6d8b31..ab3e208a91 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -395,7 +395,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s index 5cd01c99ed..5c2de273c3 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -349,7 +349,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s index 4bee632159..0b4173770c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -346,7 +346,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s index 07b0314e07..34b5ab5bef 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -349,7 +349,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s index e06dff823d..ee674d9fb4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -346,7 +346,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s index cbf9369655..3df51fc061 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s index f01ddc90c8..127a57523e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s index 405ff89c98..f051b961ab 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -333,7 +333,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s index 1ab24a4e62..2e525fef9a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s index eef17d945a..206ebac03a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s index 2bbb09e3d6..07b5ab14b5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s index 34c5523fc8..e4f4cadb0c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s index eeb9a935c7..96acbe5a34 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -364,7 +364,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s index 04e93b7f99..b075538251 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -364,7 +364,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s index f0d85b843f..6ab5af70a8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -338,7 +338,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s index 32d6309d85..484c537974 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -336,7 +336,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s index 8a5053d692..91ebfcf76d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -345,7 +345,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s index 38592fb2b3..6441f74eea 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s index 7694bde24c..0d3aa8416a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s index f359daa264..cb35e9df44 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s index 594b0cb24e..fe304cbadf 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s index e7a237c06b..d9024528dd 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s index c80234c0c8..ea571b6b34 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s index f0d85c946b..cbf7267c3c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -414,7 +414,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_ ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s index 5493851bf7..1335ab460f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -414,7 +414,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s index dc93a75aa7..1c56d1b434 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -446,7 +446,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_ ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s index 27d98fdef8..7ef4e2ca66 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s index 71966da65e..58922b4b20 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s index 6158c246ec..6512a17cc3 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s index a95fb19718..fa23a6071e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s index 6b1ec53c91..d84915d69b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s index 3cc742785d..7a07bdbc0e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s index 04b4048fae..853dae6d0d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -413,7 +413,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s index 5a11412030..397ef9e7de 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s index 2403765f49..26ce5064cb 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -413,7 +413,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s index 04310f1da3..20c253eab1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s index 6bad524293..487c1975ca 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -445,7 +445,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s index b0a9d38508..ac9c4550de 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s index 03a8a27d5f..6a4691cd92 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -397,7 +397,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s index 7eef201f3e..ed74a35957 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s index cda7cea3c2..8c08c3d7bd 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s index fc09be30a5..f30ad8a151 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s index 85df04090a..436cd2d1bc 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s index 210e635f18..f6d313fb54 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s index 338fd9c1ba..744f07d086 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s index 34874a915b..e80574496f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -414,7 +414,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s index 5f45063552..5a9158f22d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s index 1b9f4e632f..57c7420d41 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s index d02c0fdff0..3b764a274e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -412,7 +412,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s index 99c9969c63..9e4f169b96 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -397,7 +397,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s index b4c0494580..c27e69964a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -394,7 +394,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s index 802217fe11..09ddb8edba 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s index 618b603cdf..57d421f70e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -386,7 +386,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s index 1305c9d699..14e46d3fe2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -410,7 +410,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s index e7329cc8b6..f6b37bc2a0 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -413,7 +413,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s index dbae38c8a1..f53f246736 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s index 139f5c831e..4f66e778aa 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s index e74e767206..60969d5ab8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -386,7 +386,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s index fbe89556aa..435c44cbcd 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s index 412d077baf..414ced7af5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s index d0c338ad30..3eb23bcac8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s index fc7e384ba5..9f162124d6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s index e92096a19e..e87a193782 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -397,7 +397,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s index 9ca62e64e9..1a456aa458 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s index af9e1d52b2..f514b44afc 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s index be70e36ef2..560c4efae4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s index 13108fabd9..9cac95f11d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s index 40525c1604..2738218ede 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s index 4c28785537..702b861002 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s index 127264ac6b..3d7f0c00a6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -414,7 +414,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s index ac4a3a42aa..21fc076baf 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s index 00f43ed934..3832bfd070 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s index d3358f9048..f9eee44829 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s index aaaa5ddbf3..974c6295cf 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s index c35853c5ff..998b7473f6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s index f0ba36ebdd..d1d09c77ed 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s index ea0aa531e5..502403981e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s index 0b247ad481..814b727c40 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s index 82f61a6a2e..eda497987a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s index bbec5779c1..c4f5efb286 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s index a42badaf01..748b92641d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -399,7 +399,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s index 08c672c09b..08b16e56a4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -397,7 +397,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s index e8aca30286..17fd3b2214 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -399,7 +399,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s index 3b1d0fb72a..05f354a8ce 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -397,7 +397,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index 6e637b1dc5..39a670c96a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index 0c4af25805..a49fde3856 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index 7bf5b0dff6..6e12f9683a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index 8a0c106680..4c45cb9f22 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s index ce80a182b8..c2767cc42e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s index 70c2b10fa9..1003b4b9d9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s index fbacf8b92f..0048981063 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -383,7 +383,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s index 903474f5c2..d8f7d8c8f3 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s index 1e68787e2d..fdccbde23e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -412,7 +412,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s index 20d9d94cb3..50f45d77a8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -415,7 +415,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s index 42af960308..f2e4276a32 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -397,7 +397,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s index 9f331a56b6..1d249cac80 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -394,7 +394,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s index 39eaeea497..8eaf0ee77b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -388,7 +388,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s index 11587985c3..46a7a8c96e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -386,7 +386,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s index ec96bdf006..749a8abd61 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s index e0b5c2ed91..57ecefaf5f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s index f0cb54d0bd..71a3221ac3 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -381,7 +381,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s index bd14f38e35..a9d8e2ab61 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -391,7 +391,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s index 958dd179a2..8a3d64a416 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s index 440bc34294..35cbb4c879 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -386,7 +386,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s index fa4f86c672..3215c8045c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s index ac2eb3bf56..1fb9eec570 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -410,7 +410,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index 03e42495d2..848e362387 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index f16a3ef7b3..8cb16a8944 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index 5715f0ed5a..d035a371e7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index 4b60bf448a..c59b758ee5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index 157fad5f5b..ef8cf9b178 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index 5fb7d3e5c5..22f5800e2d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index a9fe0fb04b..59c67a82aa 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index 8054fb5532..92ef46508c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s index 661f600c82..11b89cf5da 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -414,7 +414,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index 7091334748..1291e25d41 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -391,7 +391,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index 8a8bf15b02..ceb27032b9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index e9e12408b6..d7e577b602 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -391,7 +391,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index 5e792d77a9..6fd842e67a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index 94218f1cfe..eff6ed8d63 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -391,7 +391,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index 72cbc345df..3947107508 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s index c23a070902..29cf297c98 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s index 55bd5ba874..e9cd2d64d0 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s index 82e68d45e8..e8c890bfec 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s index aa6af3f222..0f64fe3a91 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -413,7 +413,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s index f4f277777c..d7031410e5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s index 4ea7a3b46f..63f08769ac 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -413,7 +413,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s index 1a2aed153e..79e353faa6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s index 5503ebec6e..1931cffcdc 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -413,7 +413,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s index 664b23bd94..0bdfa8f5a1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s index 68505b1cf5..d476df2695 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -445,7 +445,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s index 784393b27f..9394eae566 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -399,7 +399,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s index e8f7661609..21f6b1808a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -396,7 +396,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s index c005763496..15e0d26554 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -399,7 +399,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s index 317e5ce9da..a60d520b84 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -396,7 +396,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s index 16f289f4d8..2337574349 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s index 3849ab1a6f..c8bb1ce109 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s index eafe24d5c1..6428e58dfe 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -383,7 +383,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s index 59da066c91..136406e871 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s index 546426a06e..440d477267 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s index 524bcecb67..b783e7df82 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s index 3ae68067ee..7ae4e3c074 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s index 0f3189df29..a0c52e66cd 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -414,7 +414,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s index 4308da972a..3d7f919240 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -414,7 +414,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s index e4956db0d7..24b444121a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -388,7 +388,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s index a223a6897b..e643b331eb 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -386,7 +386,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s index d71c199ab7..24fc79d1d7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -395,7 +395,6 @@ igemm_fwd_gtcx_nchw_fp16_bx16_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s index 621fe14936..52b141d438 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s index 0e3963d3fc..76aca01a7f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s index 94c02e09a4..a830007580 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s index d0a79b73a3..a06138f283 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s index f9945ebd6b..2907a5182d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s index 9bb0095582..1a1e23b1c7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s index b8e3b22741..7aad89fc00 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -364,7 +364,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s index b4c9a084cc..de9ead04de 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s index ec6dd46e67..f95673b43b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -364,7 +364,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s index 4d9d33a672..2ba27ecfc5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x32x1_1x64x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x32x1_1x64x1x4.s index ec7b0ba1a0..fbe5f0a843 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x32x1_1x64x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x32x1_1x64x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s index 89124bb099..87f2f3da15 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -396,7 +396,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s index 1237e28829..4834af561d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s index 887452c618..64ff0129d8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s index 2a1e0a1658..64e5edab18 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s index ee72838b0a..b856ebd772 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x1x2x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x1x2x1_1x16x1x8.s index 89b1bc7c7d..713393545e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x1x2x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x1x2x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s index 60510154eb..00332720f1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s index db7b1a1b79..8b2c1e160a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s index e4940f7835..e7ebb27497 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -363,7 +363,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s index d4a78f8aef..6eff866cd8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s index d418713fc0..fb1238101e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -363,7 +363,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s index 5632b94de9..8856ecdfe5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x32x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x32x1_1x32x1x8.s index 5eed892d60..bdf2806c00 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x32x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x32x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s index 6f5c2de6b5..48cbaa06e6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -395,7 +395,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s index fbe664690c..7ef7dd3a08 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s index eb219e3995..3b78398f51 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -347,7 +347,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s index d06d062379..ae7da053ce 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s index 7652021ba1..fe2f63e3e1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s index e63748ee73..0c6bbf38f1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s index d47c10d126..ddd1c79456 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s index b9ed3030bb..f221f87a36 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x8x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x8x1_1x32x1x8.s index 5e1a5abf21..2f860b0131 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x8x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x8x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s index eb371c5875..006405c0b7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s index 4ae7c63289..b1bb2f1fcd 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -364,7 +364,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s index bba61b0af8..c810177d76 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s index 722cd8ad27..1ac5042ca7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s index b27b5dd81a..223294f3eb 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s index 2437beb76c..1072ffa429 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -362,7 +362,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x16x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x16x1_1x16x1x8.s index bff4efd7ca..57d4f101b5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x16x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x16x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -365,7 +365,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s index 63d0c4e5f2..17e163dafe 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -347,7 +347,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s index e607efc455..77f538cea5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -344,7 +344,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x4x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x4x1_1x16x1x4.s index 33f0bc19ef..b413ab52aa 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x4x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x4x1_1x16x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s index 546d232f78..81d270a7e1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x2x1_1x8x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x2x1_1x8x1x8.s index c1b6b0dc32..2c21848000 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x2x1_1x8x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x2x1_1x8x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -338,7 +338,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s index bc19418c55..039b734b96 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -336,7 +336,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s index e0191471bb..26a5be5e60 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -360,7 +360,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s index 524aef7108..7cb08a6aa4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -363,7 +363,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x4x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x4x1_1x16x1x8.s index f9da09129f..21296a7135 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x4x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x4x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -341,7 +341,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s index 49b6bfc7b6..6cc300e06c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x8x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x8x1_1x16x1x4.s index 304305778d..8c7b8b48ac 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x8x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x8x1_1x16x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s index 7e3c1ae38c..fe94606fa1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s index 7aaef71b70..92821f378e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -336,7 +336,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s index eac3d26485..d521205515 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x4x1_1x8x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x4x1_1x8x1x8.s index 9c65cb0d22..430b0e8c4f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x4x1_1x8x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x4x1_1x8x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -341,7 +341,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s index 9d214eadac..959697ac74 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s index ab2accb602..f1d97ecbfc 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s index 2b30e600cb..a537d140e4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s index 0ad0bded30..7ed088db88 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -349,7 +349,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s index 06a16d1738..5b483c54cc 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -347,7 +347,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s index 35fcc61ca8..748df7bdea 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s index 34c8f7a5a3..176f433606 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s index eb38983d79..bf4bb37f8b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s index 03f8173140..1a785113ce 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s index 88f36c4341..1dd950f833 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s index e046d19ca3..7ee42f31cf 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s index 0d304cebba..0d9e84eda2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -364,7 +364,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s index b95662f585..b3192c102f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s index edcdbd24d3..9956c61ebe 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s index 1b180f4051..63ed3eacad 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s index f36d27a6f0..f9fcdc27a6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s index 35bfb8a9b9..47e8446bfc 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s index 6606e95496..dbae111cba 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s index b030df6ff0..504f9883f0 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s index e8e174966b..b55dfa007f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s index 7fae862e8e..4c6020f7f1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s index b81820b91c..1fd73867d6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s index 14a2b80e9a..46f7c0c8a5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s index 7bf87c5da2..8b84a7cfc0 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -349,7 +349,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s index a49bae2e06..cd4a5a3e90 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -347,7 +347,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s index 90de164c95..cf83eb5bcb 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -349,7 +349,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s index 47c6bffc8f..bacfa9dc24 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -347,7 +347,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index ca872b95c8..2d39c38d7d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index b4844b8e2a..92f8dc1935 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index 9403bc4ad9..3a9a2356a0 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index daf3f977b7..201ec5066a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x1x2x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x1x2x1_1x16x1x8.s index 19a070adda..fe531637ba 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x1x2x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x1x2x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s index 06d2986ae2..b58e80df00 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x1x4x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x1x4x1_1x16x1x4.s index cb6e095110..4c9095d740 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x1x4x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x1x4x1_1x16x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s index aa867d1526..fad89abb82 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s index b2d5db0896..92df32af02 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -333,7 +333,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x1x2x1_1x8x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x1x2x1_1x8x1x8.s index 2713a6cba5..82c1d7714d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x1x2x1_1x8x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x1x2x1_1x8x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s index 93c58cd717..9ab292d5a5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s index dfd47defa4..b45c9cba4d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -362,7 +362,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s index 797095f259..c57a51993f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -365,7 +365,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s index 47f735315a..477981e44b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -347,7 +347,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s index 0de7ca6230..2987cb0eef 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -344,7 +344,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s index 02d653f722..738e73e92d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -338,7 +338,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s index c5a4b9f5e2..1030828db1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -336,7 +336,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x1x4x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x1x4x1_1x32x1x8.s index fdadd39d58..3d1c2c5e68 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x1x4x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x1x4x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s index 87e4c070d8..f25df085ad 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x1x8x1_1x64x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x1x8x1_1x64x1x4.s index 6d06de73a7..f67e097a72 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x1x8x1_1x64x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x1x8x1_1x64x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s index 11ffb3fffb..97b07ed00a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s index b731c901e6..dc6090a282 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -331,7 +331,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s index 4cd39c3404..bf9c1be00d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -341,7 +341,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s index fae0b223c4..797e3aa1ef 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s index 0918ae4885..b0257ab803 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -336,7 +336,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s index d58e560a06..eb486df1bd 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s index 8d872df6b8..84faec2985 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -360,7 +360,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s index 5231994c28..030141b2c1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -363,7 +363,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index 086eccffa5..add5c80555 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index 8093cf8024..5b2bd06391 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index a15b358b0b..a0434d43cb 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index 364cf4b0a8..30cae9ae39 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index 5f8f7e52bc..dc09518bd0 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index 93a9004281..c20d73849e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index 221da4f8cb..19f3c34942 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index db5af21490..94740d9f51 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s index 5f63588351..e345d911d5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -364,7 +364,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x16x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x16x1_1x32x1x8.s index 98ba1e7270..ea4f23f149 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x16x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x16x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index 019db87a91..0c41e3e255 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -341,7 +341,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index 9f48509a21..c7ff89d521 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index 2d8cd09acb..cc3d884e9b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -341,7 +341,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index e87fb8d1d6..25855a9dde 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index 7664554d57..8669b83b41 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -341,7 +341,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index 3aeac6da25..83518a0503 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s index a48ebfcf3c..41bb799ba7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x2x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x2x1_1x16x1x8.s index fcb933fd44..4c177e5e4d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x2x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x2x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s index 8169c3e16d..b73cfde6bd 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s index 907b9e2b05..d2e94744f5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s index c98f9e374e..22be234a4b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -363,7 +363,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s index ad3c8eac18..ed8799940e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s index 55b390c944..4f01d868c8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -363,7 +363,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s index cd0f21230d..9bc8f33517 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s index 966de66b70..abcaad6794 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -363,7 +363,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s index c8ef1e9253..73a1c957a2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x32x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x32x1_1x32x1x8.s index 372ada9017..9196948cc8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x32x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x32x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s index ca436f22de..d21cdc7157 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -395,7 +395,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s index 2dd5ba4215..3d8dcabc01 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -349,7 +349,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s index 47ceaf60ba..8a8e2fbc53 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -346,7 +346,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s index c29bf74254..7ce32f08c1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -349,7 +349,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s index 4965fed5de..046d7a2897 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -346,7 +346,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s index d8b8122762..cf3436d3a6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s index 42286b30ed..e6686373d8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s index d6bf60a5e9..53680d79c8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -333,7 +333,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x32_tb1x1x1x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x32_tb1x1x1x1_1x16x1x4.s index 72d3624d06..05ceabb48f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x32_tb1x1x1x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x32_tb1x1x1x1_1x16x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s index 20cb7246ca..c2c7c29af7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s index be56a92509..218ea1c1a1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s index 2955c783d5..54a978f229 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s index 1d3a42e53e..969e011b2b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s index 9797e3539b..16daa23f4d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s index 6ce3a450f3..593227535d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s index 82dcb2a9a8..79b229b516 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -364,7 +364,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s index a8bc91e25a..1eb79a9e3f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s index 70c0f0af6f..5457ffa67f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -364,7 +364,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s index ceb75c765d..6c7da78104 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s index c407ae9d86..ccdd6e8161 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -338,7 +338,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s index 5962d19349..9ed1aad9e4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -336,7 +336,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x8x16_wt64x4x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x1x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x8x16_wt64x4x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x1x1_1x16x1x8.s index 316f8f5ada..ec34931495 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x8x16_wt64x4x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x1x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x8x16_wt64x4x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x1x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt64x8x16_wt64x4x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s index fa6367fc32..9c03ca326e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -347,7 +347,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s index efc19c94bf..14d9c96267 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -345,7 +345,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s index f3b83c31d1..c0e49448cd 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s index 68eea4844e..801b24bd09 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s index 2b1bf6cbc2..f0426207bf 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s index 7e404d5b70..14a9459b48 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s index ce57cea493..3af1a334d5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s index a64b245313..0186075edc 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s index 50fc16093a..5979b2b85e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -414,7 +414,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s index d525e4b812..009d19eafc 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s index 45ec7ae32c..21b54aa8f2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -414,7 +414,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s index 9bc8a61536..3ae2191d27 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x32x1_1x64x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x32x1_1x64x1x4.s index dcbdf5c092..b90ae44bfc 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x32x1_1x64x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x32x1_1x64x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -448,7 +448,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s index 78dcba2fab..017d4ed338 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -446,7 +446,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s index deae2e9f57..a2eecd8d3e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s index 26eb31b2a0..9c31df86b2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s index 14f629d8dd..816ac9bea8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s index b295827626..5f731f0cc2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x1x2x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x1x2x1_1x16x1x8.s index fe31145d88..3af42516b1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x1x2x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x1x2x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s index 68058186d3..88846226ca 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s index 1ebdb6f587..26962b9531 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s index 2a7a933d59..5a544384a0 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -413,7 +413,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s index 5413c83707..33d1f0b408 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s index 8febbf69d3..40463fbf33 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -413,7 +413,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s index f1e37d32f4..1d511b216d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x32x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x32x1_1x32x1x8.s index b1d7f5b886..9bd9f7711a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x32x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x32x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -448,7 +448,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s index 7b0f949095..5ad28458e7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -445,7 +445,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s index b058ed2b32..f223dd7108 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s index 06c6d46747..3d87e4cd64 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -397,7 +397,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s index bf3ec90fad..08b8585d68 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s index 85ec6b8552..c3e5fe12e3 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s index 3899377e73..9c07ec741f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s index 331331e66c..9ed6d79303 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s index 314ecd6a98..96006f450b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x8x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x8x1_1x32x1x8.s index cfcc065094..6495b7b998 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x8x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x8x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s index e0f659e3da..e2c06953d7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s index 9a500c36ed..d04e3ddd69 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -414,7 +414,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s index 5542fa2728..40016840fa 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s index 3c377f33ed..fb1ffa0985 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s index c589cd88d7..afd807c981 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s index 7d468866d1..5d2e66f018 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -412,7 +412,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x16x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x16x1_1x16x1x8.s index bf9df76ed3..7d7475bbbb 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x16x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x16x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -415,7 +415,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s index a154586a99..4821ea7530 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -397,7 +397,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s index a73afb533d..a85377730a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -394,7 +394,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x4x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x4x1_1x16x1x4.s index f144a804c9..e9177c5738 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x4x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x4x1_1x16x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s index a604deebe8..e3e87a08e1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x2x1_1x8x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x2x1_1x8x1x8.s index 7193cb820f..5944e874fd 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x2x1_1x8x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x2x1_1x8x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -388,7 +388,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s index f7da27ebb5..a87c2b903c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -386,7 +386,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s index 29cdacf12d..b9993c5a31 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -410,7 +410,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s index bbead6f0b2..f932285fd9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -413,7 +413,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x4x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x4x1_1x16x1x8.s index 15b08b88ef..5903033a14 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x4x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x4x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -391,7 +391,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s index 7ef0fb7b87..77e5b7bea3 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x8x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x8x1_1x16x1x4.s index 4e71af80f8..3207067d40 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x8x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x8x1_1x16x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s index 794df2802b..d9f6d058ae 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s index 3473fcd48f..0c36d1d85f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -386,7 +386,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s index 3e5504ac46..3b0275f0cc 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x4x1_1x8x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x4x1_1x8x1x8.s index d2518d258c..8619cfdf5c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x4x1_1x8x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x4x1_1x8x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -391,7 +391,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s index 2dc6928022..d590cf94c5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s index 08b0fe9b7d..b8fbcb2fec 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s index cf85450135..670bb91ea0 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s index 4c2c0af2f6..de0632522d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -399,7 +399,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s index ffa906b9e4..4049656d60 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -397,7 +397,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s index 64518aa950..4610af0513 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s index bbe950a36f..a15d9a8c9f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s index 129993642b..96344124ae 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s index aab9aaeb2e..6ab1846ff3 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s index 4f0ed38c23..1aecfcd043 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s index 8b426fe43a..dd237ca0dd 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s index 82bb1a25ed..914eaef41c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -414,7 +414,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s index c120b4bdfd..9052532c07 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s index 804f47692e..5593671737 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s index e9f44c9887..9e971db1f0 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s index f2ccfbecb2..db09cadac7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s index 40f424c9a0..b26f492a25 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s index a310a5aca8..7cfb8b5c6a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s index b863109f49..3cee70485e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s index d580793fe9..45e02f1d06 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s index 25f60b3f77..eb4decc06f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s index 9284f10771..de468980f5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s index 1951ef4668..16ba5607e2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s index 0e6b860340..ffb645ee9a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -399,7 +399,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s index ee66dfbeb5..d920e8a39a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -397,7 +397,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s index 43c4892485..e7a5779d38 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -399,7 +399,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s index ae2e425899..1611effbb4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -397,7 +397,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index 0c31ac5495..ffa81c7a18 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index 51f9406086..3c2a8c5968 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index e6da8e3efb..868cdf23c5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index 5e10288cba..c40a590b5d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x1x2x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x1x2x1_1x16x1x8.s index e9648bdcba..d8fd0d2a87 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x1x2x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x1x2x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s index fa158422de..72de33a401 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x1x4x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x1x4x1_1x16x1x4.s index 423c9f61df..67dd542b75 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x1x4x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x1x4x1_1x16x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s index 2e08a17bed..bb89e0ed23 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s index 6d899de1ac..ac1261cfdc 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -383,7 +383,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x1x2x1_1x8x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x1x2x1_1x8x1x8.s index e9051e9b95..c0242cfb01 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x1x2x1_1x8x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x1x2x1_1x8x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s index 96503bafec..c67ac79962 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s index 0ad4186b19..2104f18c3c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -412,7 +412,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s index 3d4c006010..0cadc28873 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -415,7 +415,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s index 7197eacc69..f4e39caef7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -397,7 +397,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s index 221b87bbd6..526560ee15 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -394,7 +394,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s index ead5ffb8fc..82a5a0ff6b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -388,7 +388,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s index 8a4fde01d5..c86e8a971f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -386,7 +386,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x1x4x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x1x4x1_1x32x1x8.s index 475a675f87..6535f64a01 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x1x4x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x1x4x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s index f2944231c6..a13c144b63 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x1x8x1_1x64x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x1x8x1_1x64x1x4.s index 0335814914..cba339a81c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x1x8x1_1x64x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x1x8x1_1x64x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s index d48486c65c..17d67bfa9b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s index 8e126097d3..834d27b79e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -381,7 +381,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s index 0069bc8df4..c2f052eb67 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -391,7 +391,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s index 03d62ca174..851e31094d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s index 22640e9240..3ff8d9d41d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -386,7 +386,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s index e069ff2527..5ed6004a3e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s index c0a71ee636..1d04285f39 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -410,7 +410,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s index 67205cfcec..5881c02f16 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -413,7 +413,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index c58b2d891c..628207322c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index 9cb4d9d1c2..85659a2727 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index dc4c36a8d1..b2ab9b4d27 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index 28a4773638..3616fbff2b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index 4474c1e6e9..c86690beaa 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index 182b6f068f..cc36340d80 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index 825e12749d..3244f49d71 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index ed68c53b77..042d26cc2c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s index f4edf7f7a2..40bc74631b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -414,7 +414,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x16x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x16x1_1x32x1x8.s index a737cb6755..d94035a7ce 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x16x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x16x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index 659678a10f..680be29b54 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -391,7 +391,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index effc1e9c0d..5f3193d151 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index 4f99f577a2..7cb307f494 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -391,7 +391,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index bc5e4adb91..b5e98573a2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index fd17458207..a136a1e0ec 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -391,7 +391,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index 3e4334bc27..666bce459b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s index 354da97df5..32631f5f7b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x2x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x2x1_1x16x1x8.s index fa5f506caf..50d0b90805 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x2x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x2x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s index 4242eb2d4a..cf2a16f22f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s index 1db53cf5a7..979a64ec62 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s index 379e2571d4..079308bebd 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -413,7 +413,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s index db07e5a434..304a227065 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s index f1824bb019..bc9c2d29e4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -413,7 +413,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s index 0b4a8c4971..c1cd50328b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s index 2534adaf17..3864cae439 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -413,7 +413,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s index 99460a461b..1c279a7224 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x32x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x32x1_1x32x1x8.s index 77c1626536..1990624d4f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x32x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x32x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -448,7 +448,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s index cd7b4caafd..5ca4654401 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -445,7 +445,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s index 8d978e3211..a84dd60319 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -399,7 +399,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s index 74a25a049f..5cf1f7731e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -396,7 +396,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s index 8a1ec0f6bc..6e2425d97c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -399,7 +399,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s index 5d00ee4d93..29564e64e6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -396,7 +396,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s index c5fb30974a..493c053ceb 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s index 9f73a2f92c..aac119adaf 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s index 8db2422ea4..be1388900c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -383,7 +383,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x32_tb1x1x1x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x32_tb1x1x1x1_1x16x1x4.s index ff22f3314c..59bb8b5f28 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x32_tb1x1x1x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x32_tb1x1x1x1_1x16x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s index b241e76a9a..802bb5cbcb 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s index 9ba63298f4..8066c1ac0e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s index 208e9a43f1..e9c0aa1959 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s index de28b4a9c7..35ec3dd032 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s index 4a2fb0f2f5..fcf06a2a5f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s index 71e6b099fe..e6c454f261 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s index 1366c81f9f..88cc6269c0 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -414,7 +414,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s index f33910494d..59f9a1bd9f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s index afaf867300..d1f0f3f7c0 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -414,7 +414,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s index 8ae3d4a63b..30daa98d0a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s index a958769a8a..17e783e561 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -388,7 +388,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s index 9a6cdfb094..1b07c77e72 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -386,7 +386,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x8x16_wt64x4x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x1x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x8x16_wt64x4x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x1x1_1x16x1x8.s index e7fd594a30..14a69cd14c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x8x16_wt64x4x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x1x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x8x16_wt64x4x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x1x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt64x8x16_wt64x4x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s index 6f14808b62..b18146be7d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -397,7 +397,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s index f2f6de297f..2548cf88d6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -395,7 +395,6 @@ igemm_fwd_gtcx_nchw_fp16_bx1_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s index d2f0d8ac9e..a4e7bd3517 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s index 36e6ea3c55..5474b2cee6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s index 67445d8e1b..293fe74915 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s index e406d7b189..90001bf8b8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s index 4d5b077c94..06a57b965f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s index 452cf41149..0114187a64 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s index 95787006e1..c2496b10d0 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -364,7 +364,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s index 5058818f60..3d38dd4857 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s index b696b8a153..fba53a34f4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -364,7 +364,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s index 165b1ca368..cf6e600037 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x32x1_1x64x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x32x1_1x64x1x4.s index f5b16ab4d2..c6532b6537 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x32x1_1x64x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x32x1_1x64x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s index df08eee3c7..7835577718 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -396,7 +396,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s index 9616be6145..63011cb494 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s index c5a449d606..33ec05502a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s index 9556be8bb2..32771c261a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s index 5a5b29ab83..b735181153 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x1x2x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x1x2x1_1x16x1x8.s index f802052eb1..4e534b62ad 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x1x2x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x1x2x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s index 0a545a8456..fa6124e498 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s index 02358b5c85..93b69ea135 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s index 21b33e7f31..23260da36c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -363,7 +363,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s index 6eace7a3b3..8726c26f01 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s index 2f477ce629..e8cc4e9a98 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -363,7 +363,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s index 5e4f9bd4fd..3d0dc52cb6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x32x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x32x1_1x32x1x8.s index 21f29fd10f..8414b22dc2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x32x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x32x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s index 0f329e3436..36e29c1353 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -395,7 +395,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s index 953b31cf0a..af831d8b04 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s index 2f85a062e5..4a7ba84fb8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -347,7 +347,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s index db63023fdc..d0d8ba9c33 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s index 77abb4430e..d2d87fea21 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s index e12bc8dbc4..81f83067a2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s index 1669afd55e..5206a8a06a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s index 293b6def72..3b9f83f0a2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x8x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x8x1_1x32x1x8.s index eff9830242..16c8aeb171 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x8x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x8x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s index 5dbcf30f80..7ec710330f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s index 17d2fa7d28..88f8e4e0a3 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -364,7 +364,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s index 8db14177b4..437abadde6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s index 6e9eb68f7b..55f1a45e2c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s index 1c14bfdf24..5f5cf8655f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s index d373c5201c..1fce7fe516 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -362,7 +362,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x16x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x16x1_1x16x1x8.s index 4d97488508..47e752222f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x16x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x16x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -365,7 +365,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s index f5600bb8c6..acdfc813fd 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -347,7 +347,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s index 469a25b263..f5bc201702 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -344,7 +344,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x4x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x4x1_1x16x1x4.s index c37e3593a7..19fd5f13de 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x4x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x4x1_1x16x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s index ea19be5077..fddd8cc6a4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x2x1_1x8x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x2x1_1x8x1x8.s index 6fdb3200f5..f6658f74ee 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x2x1_1x8x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x2x1_1x8x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -338,7 +338,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s index 37bce83976..6f1cdfeb2f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -336,7 +336,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s index b42ce299b4..061e751e4f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -360,7 +360,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s index af0897c2de..1e8ca28d49 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -363,7 +363,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x4x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x4x1_1x16x1x8.s index 8de9d4cc4f..9863d2577f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x4x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x4x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -341,7 +341,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s index f0ed03c5cf..f19d6d5b7d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x8x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x8x1_1x16x1x4.s index 4d24c10bbe..785c418ffb 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x8x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x8x1_1x16x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s index 3f7f9f8ff2..1d6528f26a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s index 2f86dbcafd..bd07f4cc0c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -336,7 +336,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s index 39658ff1f5..37d39b7ce1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x4x1_1x8x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x4x1_1x8x1x8.s index 40a196415a..0644863a28 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x4x1_1x8x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x4x1_1x8x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -341,7 +341,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s index 516730fe01..d5b95c2675 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s index cc1e8a1417..a5d2f59e4f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s index 019034bf86..16758af9f1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s index debca18991..40caaa0d69 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -349,7 +349,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s index c19af33302..a5474ceae7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -347,7 +347,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s index de2a8c0171..5859c26911 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s index 82d15f18d3..a0da0342ac 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s index c9e6d99910..e95506d911 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s index 88f3605d78..5d49b5399f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s index c766edfa0b..33625d9313 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s index 22ee1eff7b..47bce00441 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s index 2fbe355b27..b0cc179721 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -364,7 +364,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s index 52e422ef12..b51204db5a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s index 0e32819285..c8a096594b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s index 6855007f88..c1f15455c7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s index 728f1f6aa4..2dc06db037 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s index 9413cc4429..a4b843e776 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s index 50d18bb73c..5079eac70d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s index d15e4f2143..571711500f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s index 8068c9d6f0..a7b7b37424 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s index c03c9cfb4f..430a3f35e0 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s index 1b7bf2c0e0..5232c53566 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s index 62600f1caa..9ad31bdac5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s index 115e3806f6..c4fc31a027 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -349,7 +349,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s index c380015fcf..5037a0f475 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -347,7 +347,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s index 07b2b30014..e5c733ac78 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -349,7 +349,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s index 23592cd121..cafd496a88 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -347,7 +347,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index 41fe326e88..68370fa226 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index acc611545b..4d9f1fa070 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index 4ff5bfd07d..8f18c983de 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index 8654eb53d7..b583ce6f33 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x1x2x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x1x2x1_1x16x1x8.s index 7c08034edd..05105296e9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x1x2x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x1x2x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s index b6a0c45f37..4e2e570cc5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x1x4x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x1x4x1_1x16x1x4.s index df5759dd8a..b1b0521b2d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x1x4x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x1x4x1_1x16x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s index 83ae861399..c5d3926dea 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s index c7184f378c..12687fa542 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -333,7 +333,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x1x2x1_1x8x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x1x2x1_1x8x1x8.s index 8f9aef5deb..8c6f908ac7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x1x2x1_1x8x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x1x2x1_1x8x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s index de6ebb47c7..3ab1199210 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s index 87927e1570..a6ded50908 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -362,7 +362,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s index e99fbf1dc8..bc6d504023 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -365,7 +365,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s index c26ee471c6..afd8e12eb9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -347,7 +347,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s index 0d114e656a..6d80ee57a3 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -344,7 +344,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s index 4566ffc65f..f34a902a99 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -338,7 +338,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s index 3cb1660140..b37d86e24c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -336,7 +336,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x1x4x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x1x4x1_1x32x1x8.s index ca69cec44d..c0ffa504aa 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x1x4x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x1x4x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s index 34f7c81ac5..20cde32b4c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x1x8x1_1x64x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x1x8x1_1x64x1x4.s index e1bb5a27b5..d9dd759246 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x1x8x1_1x64x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x1x8x1_1x64x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s index afdea86d98..551734ab1d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s index a1a2cc8847..cb1f178939 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -331,7 +331,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s index 9cd1b78bb5..b06290f394 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -341,7 +341,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s index a81969cc76..b4f9b16657 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s index 8c4af37f2d..fb21b0cc82 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -336,7 +336,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s index 695c925a5a..a88982e26a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s index 2ee5bb8105..d600eb8489 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -360,7 +360,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s index 7e102d8cfd..be6af3b795 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -363,7 +363,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index 2ba51bf59d..105af4733e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index b15abc66ab..953a32b250 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index e332dd43be..fab620e287 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index 99b2489963..7cb165244a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index cd3d559eb8..3c35720c8c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index 841af33ee1..c26b0dd4b4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index efd3131bb8..07a50baf96 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index 0e488127ad..f367fc87d9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s index 422575ea56..0d4fab6b41 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -364,7 +364,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x16x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x16x1_1x32x1x8.s index 9e44739ab0..caa3734670 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x16x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x16x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index 2fc0209362..3aec0e999e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -341,7 +341,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index 69d369a878..67c2dd5c24 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index cfd763ea0c..3700b8af0c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -341,7 +341,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index 2d61b42c52..df50404009 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index 41c7546897..ffb5216caf 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -341,7 +341,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index 03feadd13a..3d88472952 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s index d1f4f4bdf5..2ed687c2a2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x2x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x2x1_1x16x1x8.s index c82396b3c4..cfc5737b10 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x2x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x2x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s index a898bd1460..61ce86a04f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s index b2633ef4b6..f8ee51f590 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s index f1125146d6..afa5786557 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -363,7 +363,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s index 664bbe1d90..b8c3938fd1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s index 8b0bce87be..7a36bee0ae 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -363,7 +363,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s index 821a7c2594..ac71f52e95 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s index d2184597c1..805c184c88 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -363,7 +363,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s index 18495db647..de3548df3a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x32x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x32x1_1x32x1x8.s index 52f2ab832a..bcf39301ed 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x32x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x32x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s index 7600909d47..6d1b1fc04d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -395,7 +395,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s index bcef1ee260..08fabcc689 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -349,7 +349,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s index 51b687ec7a..a7bc0a655d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -346,7 +346,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s index b90bbc5b90..fae5e4a990 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -349,7 +349,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s index a9c6d17734..c72b649b4a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -346,7 +346,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s index e8aebde835..808e34fc03 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -339,7 +339,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s index 77a1e61382..25500500fe 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -337,7 +337,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s index 1e3b191924..e6864b9901 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -333,7 +333,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x32_tb1x1x1x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x32_tb1x1x1x1_1x16x1x4.s index 964dec8534..8415de86a9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x32_tb1x1x1x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x32_tb1x1x1x1_1x16x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s index b75585b655..2107d49852 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -342,7 +342,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s index 9ac5de9c61..821e034906 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -340,7 +340,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s index 5f90f0b4cb..dccb9da9c8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s index db60a00f62..c8a71781ff 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s index 48092cca61..742b51be14 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -350,7 +350,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s index 2fdc6e10ab..bf5cd743d8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -348,7 +348,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s index 6f9921a540..e33756d033 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -364,7 +364,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s index 6b1df710a6..544bf929e1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s index 38f13baded..a30c43aaa1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -364,7 +364,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s index d8686ed91b..0a5ea20709 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -366,7 +366,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s index ac2ccdf49a..e4f3cecbb7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -338,7 +338,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s index 2820eec326..54f5805f59 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -336,7 +336,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x8x16_wt64x4x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x1x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x8x16_wt64x4x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x1x1_1x16x1x8.s index cbb6d76405..7958d1e3e9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x8x16_wt64x4x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x1x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x8x16_wt64x4x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x1x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -334,7 +334,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt64x8x16_wt64x4x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s index 99e2a3b169..d94f6dc5b9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -347,7 +347,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s index 7d78f27676..fa9a9dafa4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -345,7 +345,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex0_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_c], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s index 459d4fc1d2..6cf4ee21d6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s index 49bfb9f8ea..3001e79dc1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s index 00034699f2..a28947a874 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s index 9e7f25e1fb..9aa3d1a584 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s index c3c61d4500..aacaa4f19e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s index e7f33edcc4..d50993f563 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x4x1x64_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x16_wt32x64x4_ws2x1_wr1x1_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s index 0c5dbdfc42..e35da72506 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -414,7 +414,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s index 116994bad3..13102280cd 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x32_wt16x16x16_ws2x2_wr2x2_ta1x8x2x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s index 8e0d2ee388..dbc776b54b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -414,7 +414,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s index 6a2b7addec..42e5ed5b99 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x32_wt32x32x8_ws1x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x32x1_1x64x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x32x1_1x64x1x4.s index f788b92d67..7e06d97286 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x32x1_1x64x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x32x1_1x64x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -448,7 +448,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s index 7c932160de..068ff4d6a2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1x8x1x32_tb1x32x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -446,7 +446,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x64_wt16x16x16_ws2x2_wr2x2_ta1x8x4x1_1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s index f1c11c9728..5531538bd4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s index 6e184733bd..84ab126546 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x8_wt32x32x4_ws1x1_wr2x2_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s index dae9212630..7d74cb2615 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s index bfbebde310..c6ef6ca0ce 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4x1x64_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x128x8_wt32x64x4_ws2x1_wr1x1_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x1x2x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x1x2x1_1x16x1x8.s index 042780612b..f9b7fd8c0a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x1x2x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x1x2x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s index 1a7dea766f..81687c6542 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x64_tb1x2x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x16x16_wt64x16x4_ws1x1_wr1x1_ta1x8x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s index d8ecfa6025..80bc27af06 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x1x64_tb1x1x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x16x8_wt64x16x4_ws1x1_wr1x1_ta1x4x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s index 2b8ab12ded..3b45e0a89b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -413,7 +413,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s index 2ff5e5bc44..5e17894810 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x32x8_ws1x2_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s index 3db946f665..acdf21df28 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -413,7 +413,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s index a8042a4f45..cb80064d0e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x16_wt32x64x4_ws1x1_wr2x2_ta1x4x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x32x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x32x1_1x32x1x8.s index d9bc8ef52c..f798b98953 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x32x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x32x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -448,7 +448,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s index 2963f44d79..d2d9c933a6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x4x1x64_tb1x32x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -445,7 +445,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x32_wt32x32x8_ws1x2_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s index edc36bdfdc..28b66127b4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s index 61a13d6fc1..8f72ac6b4e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -397,7 +397,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x256x8_wt32x64x4_ws1x1_wr2x2_ta1x2x2x1_1x4 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s index 431569c3a2..2ae0df5d7d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s index 041d3c3b25..00ac2b258f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x1x64_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x32x16_wt32x8x4_ws1x1_wr2x2_ta1x4x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s index 11d1424771..8cbaa71d20 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x32x8_wt32x8x4_ws1x1_wr2x2_ta1x2x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s index 7d7fad676d..d74bbeca17 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s index 1f94518327..1faacb54b3 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x1x64_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x16_wt32x8x4_ws1x2_wr2x2_ta1x4x2x1_1x4x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x8x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x8x1_1x32x1x8.s index 401c199d62..c927097b7c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x8x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x1x8x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s index c51922f32d..afb05b1b8e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x4x1x64_tb1x8x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x32_wt16x16x16_ws2x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s index b4d1931552..ff7575aac5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -414,7 +414,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s index 1e2c7a7006..50565edff2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x64_wt16x16x16_ws2x1_wr2x2_ta1x8x4x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s index c30445cbed..d7c035061f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s index a53b3ff7b6..50b1366ec1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1x64_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt128x64x8_wt32x8x4_ws1x2_wr2x2_ta1x2x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s index d442a2ebbe..e4914f157b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x16x1x1_1x1x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -412,7 +412,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x16x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x16x1_1x16x1x8.s index 2280c441ff..3a79f36190 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x16x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x16x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -415,7 +415,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s index c58b0d49f9..3f9585226b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x8x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -397,7 +397,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s index b37af5ba9c..93a102344e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x8x1x1_1x1x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -394,7 +394,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x4x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x4x1_1x16x1x4.s index c8724f5cbd..84c52d7c28 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x4x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x4x1_1x16x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s index 0a1d8a5e6e..33bb62da76 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x4x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x16x16_wt16x16x4_ws1x1_wr1x1_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x2x1_1x8x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x2x1_1x8x1x8.s index 447493cd5f..3b3d61dfd2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x2x1_1x8x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x2x1_1x8x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -388,7 +388,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s index 5015bbe1a5..411799ef76 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x2x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -386,7 +386,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x16x8_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s index b4f22446e4..b4f7ca092c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -410,7 +410,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s index dc5f9118cb..5f7d04013e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -413,7 +413,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x256x16_wt4x64x4_ws1x1_wr2x2_ta1x1x1x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x4x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x4x1_1x16x1x8.s index f221f99e09..314c6e0550 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x4x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x4x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -391,7 +391,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s index 835e463746..cce2e8b275 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x4x1x1_1x4x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x16_wt8x32x4_ws1x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x8x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x8x1_1x16x1x4.s index e50b23faed..8c8045a617 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x8x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x1x8x1_1x16x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s index d074d1d330..e7aefd2e16 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1x8_tb1x8x1x1_1x2x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x16_wt8x32x4_ws2x1_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s index 465c92d4c5..d7b1d42f0f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x2x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -386,7 +386,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s index a29e068582..b17874de47 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x2x1x1_1x4x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x8_wt8x32x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x4x1_1x8x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x4x1_1x8x1x8.s index 763d2b4892..781e4436be 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x4x1_1x8x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x1x4x1_1x8x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -391,7 +391,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s index 0ca7334e06..dcec10b0c8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x8_tb1x4x1x1_1x2x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x32x8_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s index 06ddc04567..0a502fe5ad 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s index 5ff36b08f5..bc19a9a9cc 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s index 8798e012f1..3541fe5f43 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -399,7 +399,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s index bff4fb4f50..0d19e47992 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -397,7 +397,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x16_wt4x64x4_ws2x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s index 0b054e9f12..211d712ccd 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x1x4x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s index 595801ce6f..1d8d459758 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x16_tb1x4x1x1_1x2x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt16x64x8_wt4x64x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s index 479b0b9058..fca9804412 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s index d678119d6d..ce441c6511 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt32x32x8_ws2x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s index 1ec9c68e95..3a4a71c8df 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s index 3ac072f58f..7b65d30de8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x16_wt64x32x4_ws1x1_wr2x2_ta1x8x2x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s index 92c1c45b8b..cafbe54717 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -414,7 +414,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s index 2b97866259..2c1643ee25 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x4x1x64_tb1x1x16x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x32_wt32x32x8_ws2x1_wr2x2_ta1x8x4x1_1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s index ef78f6746d..6ad52ee3a6 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s index f3a94a4055..c1bceb050c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x128x8_wt64x32x4_ws1x1_wr2x2_ta1x4x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s index 604cff71d7..700d394093 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x1x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x16x16_wt64x4x4_ws1x1_wr2x2_ta1x8x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s index 81a359897f..0716613ed3 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s index 9bdf121a77..d9dd1fd0f4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x1x128_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x32x16_wt64x4x4_ws1x2_wr2x2_ta1x8x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s index 5b6f0a21af..1071f34bb5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x32x8_wt64x4x4_ws1x2_wr2x2_ta1x4x2x1_1x2x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s index 2edc38d7b1..16297e6955 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s index 2885d9d56e..5c52dd1d4f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2x1x128_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x64x16_wt64x16x4_ws1x1_wr2x2_ta1x8x2x1_1x2 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s index 05281951bb..f6cbb9251b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s index 7e6d52bdbe..b735496d4b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x1x128_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt256x64x8_wt64x16x4_ws1x1_wr2x2_ta1x4x2x1_1x2x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s index 29bf618296..2341681f91 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -399,7 +399,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s index 78cc3b4209..4d48b068eb 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -397,7 +397,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt16x64x4_ws1x1_wr1x1_ta1x1x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s index 94a182642a..eb96564b99 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -399,7 +399,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s index c1d1c95292..8dafb8a313 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -397,7 +397,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x16_wt8x32x4_ws1x1_wr2x2_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index 428d932bfc..1f9aa4948d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index 9e93751b73..8e916e1184 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x8_wt16x64x4_ws1x1_wr1x1_ta1x1x1x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index 253567308d..5153df632d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index 19b38c36c4..3aa7c3816a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x128x8_wt8x32x4_ws1x1_wr2x2_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x1x2x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x1x2x1_1x16x1x8.s index 5d0f5eff3f..fa2de2d043 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x1x2x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x1x2x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s index cbdff9d7a8..a5fb9edd05 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x16_tb1x2x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x16_wt32x8x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x1x4x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x1x4x1_1x16x1x4.s index b918ddcf36..5fd2da6a18 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x1x4x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x1x4x1_1x16x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s index 82c2b3cfb4..9a75370000 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x16_tb1x4x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x16_wt32x8x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s index b8dc052e79..94ce590371 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x16_tb1x1x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -383,7 +383,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x8_wt32x8x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x1x2x1_1x8x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x1x2x1_1x8x1x8.s index 1265a8bd7f..5b19a82bbd 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x1x2x1_1x8x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x1x2x1_1x8x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s index f9e5b4a4fd..1debd887a0 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x16_tb1x2x1x1_1x4x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x16x8_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s index 2bb99ec156..8e67cdabff 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -412,7 +412,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s index 7e926c45bf..546ecf5726 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16x1x16_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -415,7 +415,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x256x16_wt4x64x4_ws2x1_wr2x2_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s index 9d7109bb4a..b19299c2ea 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -397,7 +397,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s index 7f85df6a0a..5440c07c1a 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -394,7 +394,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x256x8_wt4x64x4_ws2x1_wr2x2_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s index 5fc773b4f4..b357182843 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -388,7 +388,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s index bcd389bbbb..8e615ad5fe 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -386,7 +386,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x16_wt16x16x4_ws1x1_wr1x1_ta1x1x2x1_1x16 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x1x4x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x1x4x1_1x32x1x8.s index b2b758cf3e..4b1ee9c339 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x1x4x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x1x4x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s index e360323ea7..4ef18093b2 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x16x1x16_tb1x4x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x32_wt16x16x16_ws1x1_wr1x1_ta1x2x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x1x8x1_1x64x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x1x8x1_1x64x1x4.s index e70efdb7b8..81d0c8d4b4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x1x8x1_1x64x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x1x8x1_1x64x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s index f2f5c37b72..97f273de9d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x16x1x16_tb1x8x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x64_wt16x16x16_ws1x1_wr1x1_ta1x4x2x1_1x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s index 4adfc6d7a9..e8b32e0873 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -381,7 +381,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x32x8_wt16x16x4_ws1x1_wr1x1_ta1x1x1x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s index 23b5a1c9e0..92652cf4b7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -391,7 +391,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s index 1c9482eea1..5030929aaa 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x1x16_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x64x16_wt8x32x4_ws2x1_wr1x1_ta1x1x2x1_1x16x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s index 3a1766a08d..bfd328c0c0 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -386,7 +386,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s index 9d47bfc8ba..4239828057 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt32x64x8_wt8x32x4_ws2x1_wr1x1_ta1x1x1x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s index 0440748258..fb3c8baaee 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x16x1x1_1x1x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -410,7 +410,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s index a0bac5eb11..f28b8e6309 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x4_tb1x1x16x1_1x16x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -413,7 +413,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt4x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index 6cdb3c9275..5cbab13c0d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index 2aab6e1d90..4d1b0d8815 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt32x32x8_ws1x2_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index b4f0d38280..1847ea9581 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index 276f15cfd8..cc9112e472 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt32x64x4_ws1x1_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index 538542e43d..2ff2e83d10 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index 3b08afd216..7897b7e67b 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt64x32x4_ws1x1_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s index 7f07567367..99385da124 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x8x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s index 9fe0609bd3..f374214516 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x8x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x16_wt8x32x4_ws2x1_wr2x2_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s index e0245be068..a03aee4fc1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x16x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -414,7 +414,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x16x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x16x1_1x32x1x8.s index 88f8359788..c043132aad 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x16x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x16x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x32_wt32x32x8_ws1x2_wr1x1_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index 80a58c6ad0..2981f4f4e7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -391,7 +391,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index 2b07b82256..a85663b7e1 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt32x64x4_ws1x1_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index 1005e7e9b4..a2d9a8a379 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -391,7 +391,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index 65d21e0082..940e64ca26 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt64x32x4_ws1x1_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s index b1810fb065..894b7873eb 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x4x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -391,7 +391,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s index 55096b4ccf..96fd4b8912 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x4x1x1_1x2x1x128.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x128x8_wt8x32x4_ws2x1_wr2x2_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s index 3e9bd75b8d..3b9eb76bb9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x1x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x1_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x2x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x2x1_1x16x1x8.s index 457452c0af..2f85ab0d2f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x2x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x2x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s index 1d6a98a6f1..16de9d1bf3 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1x32_tb1x2x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x16x16_wt64x4x4_ws1x2_wr1x1_ta1x4x2x1_1x4x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s index 8b7c20ad3b..3b77c73852 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x32_tb1x1x1x1_1x8x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x16x8_wt64x4x4_ws1x2_wr1x1_ta1x2x2x1_1x4x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s index 8747e5338f..5b74605840 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -413,7 +413,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s index 51642dc806..36206a3570 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt16x64x4_ws1x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s index ce78d1ef74..5a87d51093 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -413,7 +413,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s index 8e9cfca2b4..ca1ee1bd54 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x32x8_ws1x1_wr2x2_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s index 3a5d330733..3d520a9a4f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x16x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -413,7 +413,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s index 3bf8f1882e..4ccb4494cc 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x16x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x16_wt32x64x4_ws1x2_wr1x1_ta1x2x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x32x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x32x1_1x32x1x8.s index 16c1718890..dbfa9a9ba8 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x32x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x32x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -448,7 +448,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s index 89cf35bfd8..937e3625a4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x32x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -445,7 +445,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x32_wt32x32x8_ws1x1_wr2x2_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s index 07137b083e..c122db4e14 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -399,7 +399,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s index 5cb83e5c51..bbed6635af 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -396,7 +396,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x8_wt16x64x4_ws1x1_wr2x2_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s index 6d69aa88a7..e947e082ae 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x8x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -399,7 +399,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s index 93b6d2e396..80110b3ab9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x8x1x1_1x1x1x256.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -396,7 +396,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x256x8_wt32x64x4_ws1x2_wr1x1_ta1x1x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s index a9734db0e8..4ab8f3be21 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x1x2x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -389,7 +389,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s index dd4f3c2fb0..a3c1b9c1fc 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1x32_tb1x2x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -387,7 +387,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x32x16_wt32x8x4_ws1x2_wr1x1_ta1x2x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s index 6526100177..6ec580b2e4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x32_tb1x1x1x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -383,7 +383,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x32x8_wt32x8x4_ws1x2_wr1x1_ta1x1x2x1_1x8x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x32_tb1x1x1x1_1x16x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x32_tb1x1x1x1_1x16x1x4.s index 906339f387..cb5d3a0071 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x32_tb1x1x1x1_1x16x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x32_tb1x1x1x1_1x16x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x4x16_wt64x4x4_ws1x1_wr1x1_ta1x8x2x1_1x2x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s index 00e587d7a1..c4554530af 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x1x4x1_1x16x1x16.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -392,7 +392,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s index 72159ca85f..5d7e6de063 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x1x32_tb1x4x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -390,7 +390,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x16_wt16x16x4_ws1x1_wr2x2_ta1x2x2x1_1x8x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s index 22b20ac38d..8fdbb0506e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s index 4d43449b26..9ca5d6886d 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws1x1_wr2x2_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s index 22d70f2414..ffe74bc2f7 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x1x8x1_1x32x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -400,7 +400,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s index 838cb57f20..9ca1c343f3 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8x1x32_tb1x8x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -398,7 +398,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x32_wt16x16x16_ws2x2_wr1x1_ta1x4x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s index 29b7fdb8e3..26c0eca93e 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -414,7 +414,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s index 05f7dfb5d8..53b88efcb4 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x64_wt16x16x16_ws1x1_wr2x2_ta1x8x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s index dd8ba6eae4..ec34750ce5 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x16x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -414,7 +414,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s index 31e51fc37b..a8c8b2167c 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8x1x32_tb1x1x16x1_1x64x1x4.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -416,7 +416,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x64_wt16x16x16_ws2x2_wr1x1_ta1x8x2x1_1x8 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s index 9eb9e171a9..614807abd9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x1x2x1_1x8x1x32.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -388,7 +388,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s index 590337b7d3..5ee7219ed9 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1x32_tb1x2x1x1_1x4x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -386,7 +386,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x64x8_wt16x16x4_ws1x1_wr2x2_ta1x1x2x1_1x8x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x8x16_wt64x4x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x1x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x8x16_wt64x4x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x1x1_1x16x1x8.s index 6a0d1c6cc3..7677d0907f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x8x16_wt64x4x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x1x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x8x16_wt64x4x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x32_tb1x1x1x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -384,7 +384,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt64x8x16_wt64x4x4_ws1x1_wr1x1_ta1x4x2x1_1x4x1x ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s index a230e5547a..2aba5b100f 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x1x8x1_1x16x1x8.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -397,7 +397,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s index 039a4e3ab8..28863ce635 100644 --- a/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s +++ b/src/kernels/dynamic_igemm/igemm_gtc_fwd_fp16/igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1x8_tb1x8x1x1_1x2x1x64.s @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -; generated by igemm_codegen.py (036d632a36e6748e328a9ed796fefbdd3633c062) +; generated by igemm_codegen.py (c8c86649c68a788f18be9f7a599a555f08903048) ; .macro .mdiv_u32_ss s_quot s_numer s_magic s_shift s_tmp s_mul_hi_u32 s[\s_tmp], s[\s_magic], s[\s_numer] @@ -395,7 +395,6 @@ igemm_fwd_gtcx_nchw_fp16_bx4_ex1_bt8x64x16_wt4x64x4_ws1x1_wr1x1_ta1x1x1x1_1x16x1 ; config for weight range s_mul_i32 s[s_p_wei+2], s[s_wei_stride_k], s[s_k] - s_add_i32 s[s_p_wei+2], s[s_p_wei+2], 1 s_lshl_b32 s[s_p_wei+2], s[s_p_wei+2], 1 s_mov_b32 s[s_p_wei+3], 0x27000 ; calculate wei offset diff --git a/src/solver/conv_asm_implicit_gemm_gtc_fwd.cpp b/src/solver/conv_asm_implicit_gemm_gtc_fwd.cpp index 0f02507eee..f502183583 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_fwd.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_fwd.cpp @@ -1395,6 +1395,10 @@ static std::tuple 1)) + { + continue; + }; } // Don't have to check, assuming the tunable itself is already valid if(cfg.gemm_n_per_block % cfg.nxb != 0) @@ -1460,6 +1464,10 @@ static std::tuple 1)) + { + continue; + }; } // Don't have to check, assuming the tunable itself is already valid if(cfg.gemm_n_per_block % cfg.nxb != 0)