@@ -889,21 +889,18 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
889889; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
890890; CHECK-SKX-VBMI-NEXT: vmovdqa (%rsi), %ymm2
891891; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3
892- ; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
893- ; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
894- ; CHECK-SKX-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4
895- ; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
896- ; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
897- ; CHECK-SKX-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm1
898- ; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
899- ; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm1
900- ; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
901- ; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
902- ; CHECK-SKX-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4
903- ; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
904- ; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
905- ; CHECK-SKX-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm0
906- ; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm0
892+ ; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
893+ ; CHECK-SKX-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5
894+ ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
895+ ; CHECK-SKX-VBMI-NEXT: vpand %ymm3, %ymm4, %ymm3
896+ ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
897+ ; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62]
898+ ; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1
899+ ; CHECK-SKX-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5
900+ ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5
901+ ; CHECK-SKX-VBMI-NEXT: vpand %ymm2, %ymm4, %ymm2
902+ ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
903+ ; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0
907904; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm0, (%rdx)
908905; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx)
909906; CHECK-SKX-VBMI-NEXT: vzeroupper
@@ -915,25 +912,19 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
915912; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %ymm1
916913; CHECK-AVX512-NEXT: vmovdqa (%rsi), %ymm2
917914; CHECK-AVX512-NEXT: vmovdqa 32(%rsi), %ymm3
918- ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
919- ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
920- ; CHECK-AVX512-NEXT: vpmullw %ymm4, %ymm5, %ymm4
921- ; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
922- ; CHECK-AVX512-NEXT: vpand %ymm5, %ymm4, %ymm4
923- ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
924- ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
925- ; CHECK-AVX512-NEXT: vpmullw %ymm3, %ymm1, %ymm1
926- ; CHECK-AVX512-NEXT: vpand %ymm5, %ymm1, %ymm1
927- ; CHECK-AVX512-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
928- ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
929- ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
930- ; CHECK-AVX512-NEXT: vpmullw %ymm3, %ymm4, %ymm3
931- ; CHECK-AVX512-NEXT: vpand %ymm5, %ymm3, %ymm3
932- ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
933- ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
934- ; CHECK-AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0
935- ; CHECK-AVX512-NEXT: vpand %ymm5, %ymm0, %ymm0
936- ; CHECK-AVX512-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
915+ ; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
916+ ; CHECK-AVX512-NEXT: vpand %ymm3, %ymm4, %ymm5
917+ ; CHECK-AVX512-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
918+ ; CHECK-AVX512-NEXT: vpandn %ymm3, %ymm4, %ymm3
919+ ; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
920+ ; CHECK-AVX512-NEXT: vpsllw $8, %ymm1, %ymm1
921+ ; CHECK-AVX512-NEXT: vpternlogq $248, %ymm4, %ymm5, %ymm1
922+ ; CHECK-AVX512-NEXT: vpand %ymm2, %ymm4, %ymm3
923+ ; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
924+ ; CHECK-AVX512-NEXT: vpandn %ymm2, %ymm4, %ymm2
925+ ; CHECK-AVX512-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
926+ ; CHECK-AVX512-NEXT: vpsllw $8, %ymm0, %ymm0
927+ ; CHECK-AVX512-NEXT: vpternlogq $248, %ymm4, %ymm3, %ymm0
937928; CHECK-AVX512-NEXT: vmovdqa %ymm0, (%rdx)
938929; CHECK-AVX512-NEXT: vmovdqa %ymm1, 32(%rdx)
939930; CHECK-AVX512-NEXT: vzeroupper
@@ -945,21 +936,18 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
945936; CHECK-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
946937; CHECK-VBMI-NEXT: vmovdqa (%rsi), %ymm2
947938; CHECK-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3
948- ; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
949- ; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
950- ; CHECK-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4
951- ; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
952- ; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
953- ; CHECK-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm1
954- ; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
955- ; CHECK-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm1
956- ; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
957- ; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
958- ; CHECK-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4
959- ; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
960- ; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
961- ; CHECK-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm0
962- ; CHECK-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm0
939+ ; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
940+ ; CHECK-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5
941+ ; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
942+ ; CHECK-VBMI-NEXT: vpand %ymm3, %ymm4, %ymm3
943+ ; CHECK-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
944+ ; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62]
945+ ; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1
946+ ; CHECK-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5
947+ ; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5
948+ ; CHECK-VBMI-NEXT: vpand %ymm2, %ymm4, %ymm2
949+ ; CHECK-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
950+ ; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0
963951; CHECK-VBMI-NEXT: vmovdqa %ymm0, (%rdx)
964952; CHECK-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx)
965953; CHECK-VBMI-NEXT: vzeroupper
@@ -976,14 +964,13 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
976964; CHECK-SKX-VBMI: # %bb.0:
977965; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
978966; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1
979- ; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
980- ; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
981- ; CHECK-SKX-VBMI-NEXT: vpmullw %zmm2, %zmm3, %zmm2
982- ; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
983- ; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
984- ; CHECK-SKX-VBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm0
985- ; CHECK-SKX-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
986- ; CHECK-SKX-VBMI-NEXT: vpermi2b %zmm2, %zmm0, %zmm1
967+ ; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
968+ ; CHECK-SKX-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3
969+ ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
970+ ; CHECK-SKX-VBMI-NEXT: vpandq %zmm1, %zmm2, %zmm1
971+ ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
972+ ; CHECK-SKX-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126]
973+ ; CHECK-SKX-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1
987974; CHECK-SKX-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx)
988975; CHECK-SKX-VBMI-NEXT: vzeroupper
989976; CHECK-SKX-VBMI-NEXT: retq
@@ -992,16 +979,13 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
992979; CHECK-AVX512: # %bb.0:
993980; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
994981; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm1
995- ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
996- ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
997- ; CHECK-AVX512-NEXT: vpmullw %zmm2, %zmm3, %zmm2
998- ; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
999- ; CHECK-AVX512-NEXT: vpandq %zmm3, %zmm2, %zmm2
1000- ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
1001- ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
1002- ; CHECK-AVX512-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1003- ; CHECK-AVX512-NEXT: vpandq %zmm3, %zmm0, %zmm0
1004- ; CHECK-AVX512-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
982+ ; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
983+ ; CHECK-AVX512-NEXT: vpandq %zmm1, %zmm2, %zmm3
984+ ; CHECK-AVX512-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
985+ ; CHECK-AVX512-NEXT: vpandnq %zmm1, %zmm2, %zmm1
986+ ; CHECK-AVX512-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
987+ ; CHECK-AVX512-NEXT: vpsllw $8, %zmm0, %zmm0
988+ ; CHECK-AVX512-NEXT: vpternlogq $248, %zmm2, %zmm3, %zmm0
1005989; CHECK-AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
1006990; CHECK-AVX512-NEXT: vzeroupper
1007991; CHECK-AVX512-NEXT: retq
@@ -1010,14 +994,13 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
1010994; CHECK-VBMI: # %bb.0:
1011995; CHECK-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
1012996; CHECK-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1
1013- ; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
1014- ; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
1015- ; CHECK-VBMI-NEXT: vpmullw %zmm2, %zmm3, %zmm2
1016- ; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
1017- ; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
1018- ; CHECK-VBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1019- ; CHECK-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
1020- ; CHECK-VBMI-NEXT: vpermi2b %zmm2, %zmm0, %zmm1
997+ ; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
998+ ; CHECK-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3
999+ ; CHECK-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
1000+ ; CHECK-VBMI-NEXT: vpandq %zmm1, %zmm2, %zmm1
1001+ ; CHECK-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
1002+ ; CHECK-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126]
1003+ ; CHECK-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1
10211004; CHECK-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx)
10221005; CHECK-VBMI-NEXT: vzeroupper
10231006; CHECK-VBMI-NEXT: retq
0 commit comments