From 20150702e51c274ecff87744b4cbb7631d331a64 Mon Sep 17 00:00:00 2001
From: Lynn Boger
Date: Mon, 25 Sep 2023 10:49:11 -0500
Subject: [PATCH] crypto/subtle: improve xorBytes assembler on PPC64

This makes some improvements to the xorBytes assembler implementation
for PPC64 targets. The loops that process large streams of bytes have
been changed to do 64 bytes at a time. Other changes were made to
prevent degradations at some of the common sizes like 8 and 16. The
case for < 8 bytes on power10 has been modified to use the LXVL and
STXVL instructions.

Change-Id: I7477d12d5375d484af8c274443d595ccdafbda7c
Reviewed-on: https://go-review.googlesource.com/c/go/+/530877
Reviewed-by: Paul Murphy
TryBot-Result: Gopher Robot
Reviewed-by: Jayanth Krishnamurthy
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Michael Pratt
Reviewed-by: Benny Siegert
Run-TryBot: Lynn Boger
---
 src/crypto/subtle/xor_ppc64x.s | 171 ++++++++++++++++++++++-----------
 1 file changed, 113 insertions(+), 58 deletions(-)

diff --git a/src/crypto/subtle/xor_ppc64x.s b/src/crypto/subtle/xor_ppc64x.s
index 72bb80d24672be..0de4350cb23746 100644
--- a/src/crypto/subtle/xor_ppc64x.s
+++ b/src/crypto/subtle/xor_ppc64x.s
@@ -13,75 +13,130 @@ TEXT ·xorBytes(SB), NOSPLIT, $0
 	MOVD	b+16(FP), R5	// R5 = b
 	MOVD	n+24(FP), R6	// R6 = n
 
-	CMPU	R6, $32, CR7	// Check if n ≥ 32 bytes
+	CMPU	R6, $64, CR7	// Check if n ≥ 64 bytes
 	MOVD	R0, R8		// R8 = index
-	CMPU	R6, $8, CR6	// Check if 8 ≤ n < 32 bytes
-	BLT	CR6, small	// Smaller than 8
-	BLT	CR7, xor16	// Case for 16 ≤ n < 32 bytes
+	CMPU	R6, $8, CR6	// Check if 8 ≤ n < 64 bytes
+	BLE	CR6, small	// <= 8
+	BLT	CR7, xor32	// Case for 32 ≤ n < 64 bytes
 
-	// Case for n ≥ 32 bytes
-preloop32:
-	SRD	$5, R6, R7	// Setup loop counter
+	// Case for n ≥ 64 bytes
+preloop64:
+	SRD	$6, R6, R7	// Set up loop counter
 	MOVD	R7, CTR
 	MOVD	$16, R10
-	ANDCC	$31, R6, R9	// Check for tailing bytes for later
-loop32:
-	LXVD2X	(R4)(R8), VS32	// VS32 = a[i,...,i+15]
-	LXVD2X	(R4)(R10), VS34
-	LXVD2X	(R5)(R8), VS33	// VS33 = b[i,...,i+15]
-	LXVD2X	(R5)(R10), VS35
-	XXLXOR	VS32, VS33, VS32	// VS34 = a[] ^ b[]
-	XXLXOR	VS34, VS35, VS34
-	STXVD2X	VS32, (R3)(R8)	// Store to dst
-	STXVD2X	VS34, (R3)(R10)
-	ADD	$32, R8		// Update index
-	ADD	$32, R10
-	BC	16, 0, loop32	// bdnz loop16
-
-	BEQ	CR0, done
-
-	MOVD	R9, R6
-	CMP	R6, $8
-	BLT	small
+	MOVD	$32, R14
+	MOVD	$48, R15
+	ANDCC	$63, R6, R9	// Check for tailing bytes for later
+	PCALIGN	$16
+	// Case for >= 64 bytes
+	// Process 64 bytes per iteration
+	// Load 4 vectors of a and b
+	// XOR the corresponding vectors
+	// from a and b and store the result
+loop64:
+	LXVD2X	(R4)(R8), VS32
+	LXVD2X	(R4)(R10), VS34
+	LXVD2X	(R4)(R14), VS36
+	LXVD2X	(R4)(R15), VS38
+	LXVD2X	(R5)(R8), VS33
+	LXVD2X	(R5)(R10), VS35
+	LXVD2X	(R5)(R14), VS37
+	LXVD2X	(R5)(R15), VS39
+	XXLXOR	VS32, VS33, VS32
+	XXLXOR	VS34, VS35, VS34
+	XXLXOR	VS36, VS37, VS36
+	XXLXOR	VS38, VS39, VS38
+	STXVD2X	VS32, (R3)(R8)
+	STXVD2X	VS34, (R3)(R10)
+	STXVD2X	VS36, (R3)(R14)
+	STXVD2X	VS38, (R3)(R15)
+	ADD	$64, R8
+	ADD	$64, R10
+	ADD	$64, R14
+	ADD	$64, R15
+	BDNZ	loop64
+	BC	12,2,LR		// BEQLR
+	MOVD	R9, R6
+	CMP	R6, $8
+	BLE	small
+	// Case for 8 <= n < 64 bytes
+	// Process 32 bytes if available
+xor32:
+	CMP	R6, $32
+	BLT	xor16
+	ADD	$16, R8, R9
+	LXVD2X	(R4)(R8), VS32
+	LXVD2X	(R4)(R9), VS33
+	LXVD2X	(R5)(R8), VS34
+	LXVD2X	(R5)(R9), VS35
+	XXLXOR	VS32, VS34, VS32
+	XXLXOR	VS33, VS35, VS33
+	STXVD2X	VS32, (R3)(R8)
+	STXVD2X	VS33, (R3)(R9)
+	ADD	$32, R8
+	ADD	$-32, R6
+	CMP	R6, $8
+	BLE	small
+	// Case for 8 <= n < 32 bytes
+	// Process 16 bytes if available
 xor16:
-	CMP	R6, $16
-	BLT	xor8
-	LXVD2X	(R4)(R8), VS32
-	LXVD2X	(R5)(R8), VS33
-	XXLXOR	VS32, VS33, VS32
-	STXVD2X	VS32, (R3)(R8)
-	ADD	$16, R8
-	ADD	$-16, R6
-	CMP	R6, $8
-	BLT	small
+	CMP	R6, $16
+	BLT	xor8
+	LXVD2X	(R4)(R8), VS32
+	LXVD2X	(R5)(R8), VS33
+	XXLXOR	VS32, VS33, VS32
+	STXVD2X	VS32, (R3)(R8)
+	ADD	$16, R8
+	ADD	$-16, R6
+small:
+	CMP	R6, R0
+	BC	12,2,LR		// BEQLR
 xor8:
-	// Case for 8 ≤ n < 16 bytes
-	MOVD	(R4)(R8), R14	// R14 = a[i,...,i+7]
-	MOVD	(R5)(R8), R15	// R15 = b[i,...,i+7]
-	XOR	R14, R15, R16	// R16 = a[] ^ b[]
-	SUB	$8, R6		// n = n - 8
-	MOVD	R16, (R3)(R8)	// Store to dst
-	ADD	$8, R8
-
-	// Check if we're finished
-	CMP	R6, R0
-	BGT	small
+#ifdef GOPPC64_power10
+	SLD	$56,R6,R17
+	ADD	R4,R8,R18
+	ADD	R5,R8,R19
+	ADD	R3,R8,R20
+	LXVL	R18,R17,V0
+	LXVL	R19,R17,V1
+	VXOR	V0,V1,V1
+	STXVL	V1,R20,R17
 	RET
-
-	// Case for n < 8 bytes and tailing bytes from the
-	// previous cases.
-small:
+#else
+	CMP	R6, $8
+	BLT	xor4
+	// Case for 8 ≤ n < 16 bytes
+	MOVD	(R4)(R8), R14	// R14 = a[i,...,i+7]
+	MOVD	(R5)(R8), R15	// R15 = b[i,...,i+7]
+	XOR	R14, R15, R16	// R16 = a[] ^ b[]
+	SUB	$8, R6		// n = n - 8
+	MOVD	R16, (R3)(R8)	// Store to dst
+	ADD	$8, R8
+xor4:
+	CMP	R6, $4
+	BLT	xor2
+	MOVWZ	(R4)(R8), R14
+	MOVWZ	(R5)(R8), R15
+	XOR	R14, R15, R16
+	MOVW	R16, (R3)(R8)
+	ADD	$4,R8
+	ADD	$-4,R6
+xor2:
+	CMP	R6, $2
+	BLT	xor1
+	MOVHZ	(R4)(R8), R14
+	MOVHZ	(R5)(R8), R15
+	XOR	R14, R15, R16
+	MOVH	R16, (R3)(R8)
+	ADD	$2,R8
+	ADD	$-2,R6
+xor1:
 	CMP	R6, R0
-	BEQ	done
-	MOVD	R6, CTR		// Setup loop counter
-
-loop:
+	BC	12,2,LR		// BEQLR
 	MOVBZ	(R4)(R8), R14	// R14 = a[i]
 	MOVBZ	(R5)(R8), R15	// R15 = b[i]
 	XOR	R14, R15, R16	// R16 = a[i] ^ b[i]
 	MOVB	R16, (R3)(R8)	// Store to dst
-	ADD	$1, R8
-	BC	16, 0, loop	// bdnz loop
-
+#endif
 done:
 	RET
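
Note for readers who do not follow PPC64 assembler: the sketch below is a rough
Go rendering of the control flow this patch introduces, namely a 64-byte main
loop (four 16-byte vector XORs per iteration in the assembler), at most one
32-byte and one 16-byte block, and then small scalar tails. It is illustrative
only, not the code in the patch or Go's actual generic fallback; the function
name xorBytesRef and the word-sized loads are assumptions made for the example.

// Hypothetical reference sketch (not the code in this patch): it mirrors the
// block structure of the new assembler using ordinary integer loads.
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

// xorBytesRef XORs a and b into dst; equal lengths are assumed.
func xorBytesRef(dst, a, b []byte) {
	n := len(dst)
	i := 0

	// Main loop: 64 bytes per iteration. The assembler does this with four
	// 16-byte vector loads from each input, four XXLXORs, and four stores;
	// here it is modeled as eight 64-bit words.
	for ; n-i >= 64; i += 64 {
		for j := 0; j < 64; j += 8 {
			x := binary.LittleEndian.Uint64(a[i+j:])
			y := binary.LittleEndian.Uint64(b[i+j:])
			binary.LittleEndian.PutUint64(dst[i+j:], x^y)
		}
	}

	// At most one block of each smaller size, like the xor32/xor16/xor8/xor4
	// labels: after the 64-byte loop fewer than 64 bytes remain, so each
	// block size can apply at most once.
	for _, step := range []int{32, 16, 8, 4} {
		if n-i >= step {
			for j := 0; j < step; j += 4 {
				x := binary.LittleEndian.Uint32(a[i+j:])
				y := binary.LittleEndian.Uint32(b[i+j:])
				binary.LittleEndian.PutUint32(dst[i+j:], x^y)
			}
			i += step
		}
	}

	// Remaining 0-3 bytes, one at a time. On power10 the assembler instead
	// finishes the short tail with a single length-controlled LXVL/STXVL
	// vector load/store pair.
	for ; i < n; i++ {
		dst[i] = a[i] ^ b[i]
	}
}

func main() {
	a := make([]byte, 100)
	b := make([]byte, 100)
	for i := range a {
		a[i] = byte(i)
		b[i] = byte(3*i + 1)
	}
	got := make([]byte, len(a))
	want := make([]byte, len(a))
	xorBytesRef(got, a, b)
	for i := range want {
		want[i] = a[i] ^ b[i]
	}
	fmt.Println("match:", bytes.Equal(got, want)) // expect: match: true
}

Unrolling to 64 bytes keeps four independent vector XOR chains in flight per
loop trip, and the explicit 32/16/8-byte blocks are what protect the common
small sizes mentioned in the commit message from regressing.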