Skip to content

Commit

Permalink
crypto/subtle: improve xorBytes assembler on PPC64
Browse files Browse the repository at this point in the history
This makes some improvements to the xorBytes assembler
implementation for PPC64 targets.

The loop that processes large streams of bytes has been changed to
handle 64 bytes per iteration. Other changes were made to prevent
performance degradation at common sizes such as 8 and 16 bytes.

The case for < 8 bytes on power10 has been modified to use
the LXVL and STXVL (load/store VSX vector with length) instructions.

Change-Id: I7477d12d5375d484af8c274443d595ccdafbda7c
Reviewed-on: https://go-review.googlesource.com/c/go/+/530877
Reviewed-by: Paul Murphy <[email protected]>
TryBot-Result: Gopher Robot <[email protected]>
Reviewed-by: Jayanth Krishnamurthy <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Michael Pratt <[email protected]>
Reviewed-by: Benny Siegert <[email protected]>
Run-TryBot: Lynn Boger <[email protected]>
  • Loading branch information
laboger authored and pull[bot] committed Feb 1, 2024
1 parent f1215c5 commit 2015070
Showing 1 changed file with 113 additions and 58 deletions.
171 changes: 113 additions & 58 deletions src/crypto/subtle/xor_ppc64x.s
Original file line number Diff line number Diff line change
Expand Up @@ -13,75 +13,130 @@ TEXT ·xorBytes(SB), NOSPLIT, $0
MOVD b+16(FP), R5 // R5 = b
MOVD n+24(FP), R6 // R6 = n

CMPU R6, $32, CR7 // Check if n ≥ 32 bytes
CMPU R6, $64, CR7 // Check if n ≥ 64 bytes
MOVD R0, R8 // R8 = index
CMPU R6, $8, CR6 // Check if 8 ≤ n < 32 bytes
BLT CR6, small // Smaller than 8
BLT CR7, xor16 // Case for 16 ≤ n < 32 bytes
CMPU R6, $8, CR6 // Check if 8 ≤ n < 64 bytes
BLE CR6, small // <= 8
BLT CR7, xor32 // Case for 32 ≤ n < 64 bytes

// Case for n ≥ 32 bytes
preloop32:
SRD $5, R6, R7 // Setup loop counter
// Case for n ≥ 64 bytes
preloop64:
SRD $6, R6, R7 // Set up loop counter
MOVD R7, CTR
MOVD $16, R10
ANDCC $31, R6, R9 // Check for tailing bytes for later
loop32:
LXVD2X (R4)(R8), VS32 // VS32 = a[i,...,i+15]
LXVD2X (R4)(R10), VS34
LXVD2X (R5)(R8), VS33 // VS33 = b[i,...,i+15]
LXVD2X (R5)(R10), VS35
XXLXOR VS32, VS33, VS32 // VS34 = a[] ^ b[]
XXLXOR VS34, VS35, VS34
STXVD2X VS32, (R3)(R8) // Store to dst
STXVD2X VS34, (R3)(R10)
ADD $32, R8 // Update index
ADD $32, R10
BC 16, 0, loop32 // bdnz loop16

BEQ CR0, done

MOVD R9, R6
CMP R6, $8
BLT small
MOVD $32, R14
MOVD $48, R15
ANDCC $63, R6, R9 // Check for tailing bytes for later
PCALIGN $16
// Case for >= 64 bytes
// Process 64 bytes per iteration
// Load 4 vectors of a and b
// XOR the corresponding vectors
// from a and b and store the result
loop64:
LXVD2X (R4)(R8), VS32
LXVD2X (R4)(R10), VS34
LXVD2X (R4)(R14), VS36
LXVD2X (R4)(R15), VS38
LXVD2X (R5)(R8), VS33
LXVD2X (R5)(R10), VS35
LXVD2X (R5)(R14), VS37
LXVD2X (R5)(R15), VS39
XXLXOR VS32, VS33, VS32
XXLXOR VS34, VS35, VS34
XXLXOR VS36, VS37, VS36
XXLXOR VS38, VS39, VS38
STXVD2X VS32, (R3)(R8)
STXVD2X VS34, (R3)(R10)
STXVD2X VS36, (R3)(R14)
STXVD2X VS38, (R3)(R15)
ADD $64, R8
ADD $64, R10
ADD $64, R14
ADD $64, R15
BDNZ loop64
BC 12,2,LR // BEQLR
MOVD R9, R6
CMP R6, $8
BLE small
// Case for 8 <= n < 64 bytes
// Process 32 bytes if available
xor32:
CMP R6, $32
BLT xor16
ADD $16, R8, R9
LXVD2X (R4)(R8), VS32
LXVD2X (R4)(R9), VS33
LXVD2X (R5)(R8), VS34
LXVD2X (R5)(R9), VS35
XXLXOR VS32, VS34, VS32
XXLXOR VS33, VS35, VS33
STXVD2X VS32, (R3)(R8)
STXVD2X VS33, (R3)(R9)
ADD $32, R8
ADD $-32, R6
CMP R6, $8
BLE small
// Case for 8 <= n < 32 bytes
// Process 16 bytes if available
xor16:
CMP R6, $16
BLT xor8
LXVD2X (R4)(R8), VS32
LXVD2X (R5)(R8), VS33
XXLXOR VS32, VS33, VS32
STXVD2X VS32, (R3)(R8)
ADD $16, R8
ADD $-16, R6
CMP R6, $8
BLT small
CMP R6, $16
BLT xor8
LXVD2X (R4)(R8), VS32
LXVD2X (R5)(R8), VS33
XXLXOR VS32, VS33, VS32
STXVD2X VS32, (R3)(R8)
ADD $16, R8
ADD $-16, R6
small:
CMP R6, R0
BC 12,2,LR // BEQLR
xor8:
// Case for 8 ≤ n < 16 bytes
MOVD (R4)(R8), R14 // R14 = a[i,...,i+7]
MOVD (R5)(R8), R15 // R15 = b[i,...,i+7]
XOR R14, R15, R16 // R16 = a[] ^ b[]
SUB $8, R6 // n = n - 8
MOVD R16, (R3)(R8) // Store to dst
ADD $8, R8

// Check if we're finished
CMP R6, R0
BGT small
#ifdef GOPPC64_power10
SLD $56,R6,R17
ADD R4,R8,R18
ADD R5,R8,R19
ADD R3,R8,R20
LXVL R18,R17,V0
LXVL R19,R17,V1
VXOR V0,V1,V1
STXVL V1,R20,R17
RET

// Case for n < 8 bytes and tailing bytes from the
// previous cases.
small:
#else
CMP R6, $8
BLT xor4
// Case for 8 ≤ n < 16 bytes
MOVD (R4)(R8), R14 // R14 = a[i,...,i+7]
MOVD (R5)(R8), R15 // R15 = b[i,...,i+7]
XOR R14, R15, R16 // R16 = a[] ^ b[]
SUB $8, R6 // n = n - 8
MOVD R16, (R3)(R8) // Store to dst
ADD $8, R8
xor4:
CMP R6, $4
BLT xor2
MOVWZ (R4)(R8), R14
MOVWZ (R5)(R8), R15
XOR R14, R15, R16
MOVW R16, (R3)(R8)
ADD $4,R8
ADD $-4,R6
xor2:
CMP R6, $2
BLT xor1
MOVHZ (R4)(R8), R14
MOVHZ (R5)(R8), R15
XOR R14, R15, R16
MOVH R16, (R3)(R8)
ADD $2,R8
ADD $-2,R6
xor1:
CMP R6, R0
BEQ done
MOVD R6, CTR // Setup loop counter

loop:
BC 12,2,LR // BEQLR
MOVBZ (R4)(R8), R14 // R14 = a[i]
MOVBZ (R5)(R8), R15 // R15 = b[i]
XOR R14, R15, R16 // R16 = a[i] ^ b[i]
MOVB R16, (R3)(R8) // Store to dst
ADD $1, R8
BC 16, 0, loop // bdnz loop

#endif
done:
RET

0 comments on commit 2015070

Please sign in to comment.