Reshuffle shift and rotate patterns for APX

H. Peter Anvin (Intel) · H. Peter Anvin (Intel) · commit e3f26e25a20b · 2025-10-07T16:26:09.000-07:00
The shift and rotate patterns are "interesting" in the following way: 1. Even though only 4/5/6 bits of the input are ever used, for the regular instructions the input is specified as the CL register, but for the -X instructions as a size-matching register. This makes the optimization patterns "interesting." 2. The sequencing of legacy, VEX -X versions, APX EVEX, and APX -X patterns. For #1, allow any size register to contain the shift count. For #2, split up the macro generation of the patterns, and add a new "$xshift" macro to deal with the combinatorics of generating all the -X patterns. Written directly in Perl since it seemed easier than trying to make anything more general for what is very much a special case... Reported-by: Maciej Wieczor-Retman <maciej.wieczor-retman@intel.com> Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
diff --git a/x86/insns.dat b/x86/insns.dat
@@ -80,19 +80,12 @@ $bwdq TEST	rm#,imm#			[mi:	o# f6# /0 i#		]		8086,SM
 ;# The basic shift and rotate operations
 $shift		ROL ROR RCL RCR SHL,SAL SHR - SAR
 
-$dq   RORX	reg#,rm#*,imm8			[rmi:	vex+.lz.f2.0f3a.w# f0 /r ib]		BMI2,SM0-1
-$dq   ROLX	reg#,rm#*,imm_known8		[rmi:	vex+.lz.f2.0f3a.w# f0 /r ib^(d:1f/3f)]	BMI2,SM0-1
-$dq   SHLX	reg#,rm#*,reg#			[rmv:	vex+.lz.66.0f38.w# f7 /r]		BMI2,SM
-$dq   SALX	reg#,rm#*,reg#			[rmv:	vex+.lz.66.0f38.w# f7 /r]		BMI2,SM,ND
-$dq   SARX	reg#,rm#*,reg#			[rmv:	vex+.lz.f3.0f38.w# f7 /r]		BMI2,SM
-$dq   SHRX	reg#,rm#*,reg#			[rmv:	vex+.lz.f2.0f38.w# f7 /r]		BMI2,SM
-
-$dq   ROR	reg#,rm#,imm8			[rmi:	vex+.lz.f2.0f3a.w# f0 /r ib]		BMI2,SM0-1,ND,NF!,OPT
-$dq   ROL	reg#,rm#*,imm_known8		[rmi:	vex+.lz.f2.0f3a.w# f0 /r ib^(d:1f/3f)]	BMI2,SM0-1,ND,NF!,OPT
-$dq   SHL	reg#,rm#*,reg#			[rmv:	vex+.lz.66.0f38.w# f7 /r]		BMI2,ND,NF!,OPT
-$dq   SAL	reg#,rm#*,reg#			[rmv:	vex+.lz.66.0f38.w# f7 /r]		BMI2,ND,NF!,OPT
-$dq   SAR	reg#,rm#*,reg#			[rmv:	vex+.lz.f3.0f38.w# f7 /r]		BMI2,ND,NF!,OPT
-$dq   SHR	reg#,rm#*,reg#			[rmv:	vex+.lz.f2.0f38.w# f7 /r]		BMI2,ND,NF!,OPT
+; VEX -X versions (RORX, ROLX, SHLX, SALX, SARX, SHRX) and their ND,NF!,OPT plain-mnemonic forms
+$xshift evex=0
+
+;# APX EVEX versions
+$eshift		ROL ROR RCL RCR SHL,SAL SHR - SAR
+$xshift evex=1
 
 ;# Other basic integer arithmetic
 $wd   INC	reg#				[r:	o# 40+r]				8086,NOLONG
diff --git a/x86/preinsns.pl b/x86/preinsns.pl
@@ -35,18 +35,64 @@
 };
 
 # Common pattern for the basic shift and rotate instructions
+# Legacy encodings only: the APX EVEX versions are split out into 'eshift'
+# so that the -X VEX patterns ('xshift') can be emitted between the two
 $macros{'shift'} = {
     'def' => *def_eightfold,
 	'txt' => <<'EOL'
 $$bwdq $op	rm#,unity			[m-:	o# d0# /$n]				]	8086,FL
 $$bwdq $op	rm#,reg_cl			[m-:	o# d2# /$n]				]	8086,FL
+$$bwdq $op	rm#,reg_cx			[m-:	o# d2# /$n]				]	8086,FL,ND
+$$bwdq $op	rm#,reg_ecx			[m-:	o# d2# /$n]				]	8086,FL,ND
+$$bwdq $op	rm#,reg_rcx			[m-:	o# d2# /$n]				]	8086,FL,ND
 $$bwdq $op	rm#,imm8			[mi:	o# c0# /$n ib,u]			]	186,FL
+EOL
+};
+
+# APX EVEX versions: same count forms as 'shift' (unity, CL with ND-flagged CX/ECX/RCX, imm8) behind a leading reg#? operand
+$macros{'eshift'} = {
+    'def' => *def_eightfold,
+	'txt' => <<'EOL'
 $$bwdq $op	reg#?,rm#,unity			[vm-:	evex.ndx.nf.l0.m4.o#  d0# /$n		]	$apx,FL,SM0-1
 $$bwdq $op	reg#?,rm#,reg_cl		[vm-:	evex.ndx.nf.l0.m4.o#  d2# /$n		]	$apx,FL,SM0-1
+$$bwdq $op	reg#?,rm#,reg_cx		[vm-:	evex.ndx.nf.l0.m4.o#  d2# /$n		]	$apx,FL,SM0-1,ND
+$$bwdq $op	reg#?,rm#,reg_ecx		[vm-:	evex.ndx.nf.l0.m4.o#  d2# /$n		]	$apx,FL,SM0-1,ND
+$$bwdq $op	reg#?,rm#,reg_rcx		[vm-:	evex.ndx.nf.l0.m4.o#  d2# /$n		]	$apx,FL,SM0-1,ND
 $$bwdq $op	reg#?,rm#,imm8			[vmi:	evex.ndx.nf.l0.m4.o#  c0# /$n ib,u	]	$apx,FL,SM0-1
 EOL
 };
 
+# -X shifts: generate RORX/ROLX and SHLX/SALX/SARX/SHRX plus their plain-mnemonic OPT forms; evex=1 selects APX EVEX
+$macros{'xshift'} = {
+    'func' => sub {			# returns the generated pattern lines as a list
+	my($mac, $args, $rawargs) = @_;	# only $rawargs is examined
+	my @ol;
+	my $vex = 'vex';
+	my $vfl = '';
+	if (grep { /^evex=1$/ } @$rawargs) {
+	    $vex = 'evex';
+	    $vfl = 'APX';
+	}
+	foreach my $xf (['X',"$vfl"], ['', "$vfl,ND,NF!,OPT"]) {	# [mnemonic suffix, extra flags]
+	    my($x,$fl) = @$xf;
+	    foreach my $os (32, 64) {
+		my $w = ($os == 32) ? 'w0' : 'w1';
+		my $ixor = sprintf('%02x', $os-1);	# 1f/3f: XOR mask used in the ROL immediate term
+		push(@ol, "ROR$x reg$os,rm$os,imm8       [rmi: $vex.lz.f2.0f3a.$w f0 /r ib] BMI2,SM0-1,!FL,$fl");
+		push(@ol, "ROL$x reg$os,rm$os,imm_known8 [rmi: $vex.lz.f2.0f3a.$w f0 /r ib^$ixor] BMI2,SM0-1,!FL,$fl");
+		foreach my $ss (8, 16, 32, 64) {	# any size register may hold the count
+		    foreach my $opp (['SHL','66'], ['SAL','66'], ['SAR','f3'], ['SHR','f2']) {
+			my($op,$pp) = @$opp;
+			my $ndss = ($ss == $os && $op ne 'SAL') ? '' : ',ND';	# ND for the SAL alias and for mismatched count sizes
+			push(@ol, "$op$x reg$os,rm${os}*,reg$ss [rmv: $vex.lz.$pp.0f38.$w f7 /r] BMI2,SM0-1,!FL,$fl,$ndss");
+		    }
+		}
+	    }
+	}
+	return @ol;
+    }
+};
+
 #
 # Common pattern for multiple 32/64, 16/32/64, or 8/16/32/64 instructions.
 # 'z' is used for a null-prefixed default-sized instruction (osm/osd)
@@ -433,7 +479,8 @@ (@)
 ## XXX: check: CMPSS, CMPSD
 ## XXX: check VEX encoded instructions that do not write
 
-# Instructions which (possibly) change the flags
+# Instructions which (possibly) change the flags without annotations
+# The FL or !FL flags will override this
 my $flaggy = '^(aa[adms]|ad[dc]|ad[co]x|aes\w*kl|and|andn|arpl|bextr|bl[sc]ic?|bl[sc]msk|bl[sc]r|\
 bs[rf]|bt|bt[crs]|bzhi|clac|clc|cld|cli|clrssbsy|cmc|cmp|cmpxchg.*|da[as]|dec|div|\
 encodekey.*|enqcmd.*|fu?comip?|idiv|imul|inc|iret.*|kortest.*|ktest.*|lar|loadiwkey|\