@@ -2203,114 +2203,117 @@ void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t l
22032203// Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
22042204// Any remaining elements of dst will be filled with zero.
22052205// Clobbers: rscratch1
2206- // Preserves: src, mask
2206+ // Preserves: mask, vzr
// vzr is only read here (as the second source of sve_uzp1) and is expected to hold
// all zeros on entry; sve_compress_byte sets it up with sve_dup before calling.
// vector_length_in_bytes: when it is no more than MaxVectorSize/2, only the lowest
// half of src is compressed and the function returns early.
22072207void C2_MacroAssembler::sve_compress_short (FloatRegister dst, FloatRegister src, PRegister mask,
2208- FloatRegister vtmp1 , FloatRegister vtmp2 ,
2209- PRegister pgtmp) {
2208+ FloatRegister vzr , FloatRegister vtmp ,
2209+ PRegister pgtmp, unsigned vector_length_in_bytes ) {
22102210 assert (pgtmp->is_governing (), " This register has to be a governing predicate register" );
2211- assert_different_registers (dst, src, vtmp1, vtmp2);
2211+ // When called by sve_compress_byte, src and vtmp may be the same register.
2212+ assert_different_registers (dst, src, vzr);
2213+ assert_different_registers (dst, vtmp, vzr);
22122214 assert_different_registers (mask, pgtmp);
2213-
2214- // Example input: src = 8888 7777 6666 5555 4444 3333 2222 1111
2215- // mask = 0001 0000 0000 0001 0001 0000 0001 0001
2216- // Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111
2217- sve_dup (vtmp2, H, 0 );
2215+ // high <-- low
2216+ // Example input: src = hh gg ff ee dd cc bb aa, one character is 8 bits.
2217+ // mask = 01 00 00 01 01 00 01 01, one character is 1 bit.
2218+ // Expected result: dst = 00 00 00 hh ee dd bb aa
22182219
22192220 // Extend lowest half to type INT.
2220- // dst = 00004444 00003333 00002222 00001111
2221+ // dst = 00dd 00cc 00bb 00aa
22212222 sve_uunpklo (dst, S, src);
2222- // pgtmp = 00000001 00000000 00000001 00000001
2223+ // pgtmp = 0001 0000 0001 0001
22232224 sve_punpklo (pgtmp, mask);
22242225 // Pack the active elements in size of type INT to the right,
22252226 // and fill the remaining elements with zero.
2226- // dst = 00000000 00004444 00002222 00001111
2227+ // dst = 0000 00dd 00bb 00aa
22272228 sve_compact (dst, S, dst, pgtmp);
22282229 // Narrow the result back to type SHORT.
2229- // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2230- sve_uzp1 (dst, H, dst, vtmp2);
2230+ // dst = 00 00 00 00 00 dd bb aa
2231+ sve_uzp1 (dst, H, dst, vzr);
2232+
2233+ // Return if the vector length is no more than MaxVectorSize/2, since the
2234+ // highest half then holds no valid elements.
2235+ if (vector_length_in_bytes <= (MaxVectorSize >> 1 )) {
2236+ return ;
2237+ }
2238+
22312239 // Count the active elements of lowest half.
22322240 // rscratch1 = 3
22332241 sve_cntp (rscratch1, S, ptrue, pgtmp);
22342242
22352243 // Repeat to the highest half.
2236- // pgtmp = 00000001 00000000 00000000 00000001
2244+ // pgtmp = 0001 0000 0000 0001
22372245 sve_punpkhi (pgtmp, mask);
2238- // vtmp1 = 00008888 00007777 00006666 00005555
2239- sve_uunpkhi (vtmp1, S, src);
2240- // vtmp1 = 00000000 00000000 00008888 00005555
2241- sve_compact (vtmp1, S, vtmp1, pgtmp);
2242- // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2243- sve_uzp1 (vtmp1, H, vtmp1, vtmp2);
2244-
2245- // Compressed low: dst = 0000 0000 0000 0000 0000 4444 2222 1111
2246- // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2247- // Left shift(cross lane) compressed high with TRUE_CNT lanes,
2248- // TRUE_CNT is the number of active elements in the compressed low.
2249- neg (rscratch1, rscratch1);
2250- // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2251- sve_index (vtmp2, H, rscratch1, 1 );
2252- // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2253- sve_tbl (vtmp1, H, vtmp1, vtmp2);
2254-
2255- // Combine the compressed high(after shifted) with the compressed low.
2256- // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2257- sve_orr (dst, dst, vtmp1);
2246+ // vtmp = 00hh 00gg 00ff 00ee
2247+ sve_uunpkhi (vtmp, S, src);
2248+ // vtmp = 0000 0000 00hh 00ee
2249+ sve_compact (vtmp, S, vtmp, pgtmp);
2250+ // vtmp = 00 00 00 00 00 00 hh ee
2251+ sve_uzp1 (vtmp, H, vtmp, vzr);
2252+
2253+ // pgtmp = 00 00 00 00 00 01 01 01
2254+ sve_whilelt (pgtmp, H, zr, rscratch1);
2255+ // Compressed low: dst = 00 00 00 00 00 dd bb aa
2256+ // Compressed high: vtmp = 00 00 00 00 00 00 hh ee
2257+ // Combine the compressed low with the compressed high:
2258+ // dst = 00 00 00 hh ee dd bb aa
2259+ sve_splice (dst, H, pgtmp, vtmp);
22582260}
22592261
22602262// Clobbers: rscratch1, rscratch2
22612263// Preserves: src, mask
// Byte-element compress: each half of src is widened to SHORT, compressed by
// sve_compress_short, and narrowed back with sve_uzp1; the two compressed halves
// are then joined with sve_splice. vtmp3 is zeroed and used as the zero vector (vzr).
22622264void C2_MacroAssembler::sve_compress_byte (FloatRegister dst, FloatRegister src, PRegister mask,
2263- FloatRegister vtmp1, FloatRegister vtmp2,
2264- FloatRegister vtmp3, FloatRegister vtmp4,
2265- PRegister ptmp, PRegister pgtmp) {
2265+ FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
2266+ PRegister ptmp, PRegister pgtmp, unsigned vector_length_in_bytes) {
22662267 assert (pgtmp->is_governing (), " This register has to be a governing predicate register" );
2267- assert_different_registers (dst, src, vtmp1, vtmp2, vtmp3, vtmp4 );
2268+ assert_different_registers (dst, src, vtmp1, vtmp2, vtmp3);
22682269 assert_different_registers (mask, ptmp, pgtmp);
2269- // Example input: src = 88 77 66 55 44 33 22 11
2270- // mask = 01 00 00 01 01 00 01 01
2271- // Expected result: dst = 00 00 00 88 55 44 22 11
2270+ // high <-- low
2271+ // Example input: src = q p n m l k j i h g f e d c b a, one character is 8 bits.
2272+ // mask = 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 1, one character is 1 bit.
2273+ // Expected result: dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2274+ FloatRegister vzr = vtmp3;
2275+ sve_dup (vzr, B, 0 );
22722276
2273- sve_dup (vtmp4, B, 0 );
22742277 // Extend lowest half to type SHORT.
2275- // vtmp1 = 0044 0033 0022 0011
2278+ // vtmp1 = 0h 0g 0f 0e 0d 0c 0b 0a
22762279 sve_uunpklo (vtmp1, H, src);
2277- // ptmp = 0001 0000 0001 0001
2280+ // ptmp = 00 01 00 00 00 01 00 01
22782281 sve_punpklo (ptmp, mask);
2279- // Count the active elements of lowest half.
2280- // rscratch2 = 3
2281- sve_cntp (rscratch2, H, ptrue, ptmp);
22822282 // Pack the active elements in size of type SHORT to the right,
22832283 // and fill the remaining elements with zero.
2284- // dst = 0000 0044 0022 0011
2285- sve_compress_short (dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2284+ // dst = 00 00 00 00 00 0g 0c 0a
// The widened data is twice as long in bytes; the length passed on is capped at MaxVectorSize.
2285+ unsigned extended_size = vector_length_in_bytes << 1 ;
2286+ sve_compress_short (dst, vtmp1, ptmp, vzr, vtmp2, pgtmp, extended_size > MaxVectorSize ? MaxVectorSize : extended_size);
22862287 // Narrow the result back to type BYTE.
2287- // dst = 00 00 00 00 00 44 22 11
2288- sve_uzp1 (dst, B, dst, vtmp4);
2288+ // dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2289+ sve_uzp1 (dst, B, dst, vzr);
2290+
2291+ // Return if the vector length is no more than MaxVectorSize/2, since the
2292+ // highest half then holds no valid elements.
2293+ if (vector_length_in_bytes <= (MaxVectorSize >> 1 )) {
2294+ return ;
2295+ }
2296+ // Count the active elements of lowest half.
2297+ // rscratch2 = 3
2298+ sve_cntp (rscratch2, H, ptrue, ptmp);
22892299
22902300 // Repeat to the highest half.
2291- // ptmp = 0001 0000 0000 0001
2301+ // ptmp = 00 01 00 00 00 00 00 01
22922302 sve_punpkhi (ptmp, mask);
2293- // vtmp1 = 0088 0077 0066 0055
2303+ // vtmp2 = 0q 0p 0n 0m 0l 0k 0j 0i
22942304 sve_uunpkhi (vtmp2, H, src);
2295- // vtmp1 = 0000 0000 0088 0055
2296- sve_compress_short (vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2297-
2298- sve_dup (vtmp4, B, 0 );
2299- // vtmp1 = 00 00 00 00 00 00 88 55
2300- sve_uzp1 (vtmp1, B, vtmp1, vtmp4);
2301-
2302- // Compressed low: dst = 00 00 00 00 00 44 22 11
2303- // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
2304- // Left shift(cross lane) compressed high with TRUE_CNT lanes,
2305- // TRUE_CNT is the number of active elements in the compressed low.
2306- neg (rscratch2, rscratch2);
2307- // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2308- sve_index (vtmp2, B, rscratch2, 1 );
2309- // vtmp1 = 00 00 00 88 55 00 00 00
2310- sve_tbl (vtmp1, B, vtmp1, vtmp2);
2311- // Combine the compressed high(after shifted) with the compressed low.
2312- // dst = 00 00 00 88 55 44 22 11
2313- sve_orr (dst, dst, vtmp1);
// Past the early return above, extended_size > MaxVectorSize, so the excess is passed as the
// length of the widened highest half. src and vtmp are both vtmp2 in this call.
2305+ // vtmp1 = 00 00 00 00 00 00 0p 0i
2306+ sve_compress_short (vtmp1, vtmp2, ptmp, vzr, vtmp2, pgtmp, extended_size - MaxVectorSize);
2307+ // vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2308+ sve_uzp1 (vtmp1, B, vtmp1, vzr);
2309+
2310+ // ptmp = 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
2311+ sve_whilelt (ptmp, B, zr, rscratch2);
2312+ // Compressed low: dst = 0 0 0 0 0 0 0 0 0 0 0 0 0 g c a
2313+ // Compressed high: vtmp1 = 0 0 0 0 0 0 0 0 0 0 0 0 0 0 p i
2314+ // Combine the compressed low with the compressed high:
2315+ // dst = 0 0 0 0 0 0 0 0 0 0 0 p i g c a
2316+ sve_splice (dst, B, ptmp, vtmp1);
23142317}
23152318
23162319void C2_MacroAssembler::neon_reverse_bits (FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
0 commit comments