Skip to content

Commit

Permalink
crypto/sha512: optimize ARM64 sha512 implemention
Browse files Browse the repository at this point in the history
This CL enable sha512 for arm64 and ~390% performance
improvement.

Contributed under the Go License with permission of
Linaro by Carlos Eduardo Seo <[email protected]>

https://perf.golang.org/search?q=upload:20220526.18

Hash8Bytes/New		16.0MB/s ± 0%	61.3MB/s ± 0%	+283.97% (p=0.000 n=9+9)
Hash8Bytes/Sum384	16.4MB/s ± 0%	64.8MB/s ± 0%	+295.31% (p=0.000 n=8+9)
Hash8Bytes/Sum512	16.3MB/s ± 0%	64.2MB/s ± 0%	+293.37% (p=0.000 n=10+10)
Hash1K/New		252MB/s ± 0%	1217MB/s ± 0%	+383.00% (p=0.000 n=9+10)
Hash1K/Sum384		253MB/s ± 0%	1237MB/s ± 0%	+389.25% (p=0.000 n=10+10)
Hash1K/Sum512		253MB/s ± 0%	1231MB/s ± 0%	+387.37% (p=0.000 n=10+8)
Hash8K/New		284MB/s ± 0%	1405MB/s ± 2%	+395.19% (p=0.000 n=9+8)
Hash8K/Sum384		284MB/s ± 0%	1413MB/s ± 0%	+397.76% (p=0.000 n=10+8)
Hash8K/Sum512		284MB/s ± 0%	1411MB/s ± 0%	+397.19% (p=0.000 n=10+10)

Change-Id: I4476da23d8cd376bf1f75d946d6b0c58470df1b8
Reviewed-on: https://go-review.googlesource.com/c/go/+/180257
Reviewed-by: Carlos Eduardo Seo <[email protected]>
Reviewed-by: Ard Biesheuvel <[email protected]>
Reviewed-by: Heschi Kreinick <[email protected]>
Reviewed-by: Filippo Valsorda <[email protected]>
Run-TryBot: Meng Zhuo <[email protected]>
TryBot-Result: Gopher Robot <[email protected]>
  • Loading branch information
mengzhuo committed Aug 30, 2022
1 parent e8f0340 commit 4381c61
Show file tree
Hide file tree
Showing 3 changed files with 154 additions and 1 deletion.
18 changes: 18 additions & 0 deletions src/crypto/sha512/sha512block_arm64.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package sha512

import "internal/cpu"

func block(dig *digest, p []byte) {
if cpu.ARM64.HasSHA512 {
blockAsm(dig, p)
return
}
blockGeneric(dig, p)
}

//go:noescape
func blockAsm(dig *digest, p []byte)
135 changes: 135 additions & 0 deletions src/crypto/sha512/sha512block_arm64.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Based on the Linux Kernel with the following comment:
// Algorithm based on https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fb87127bcefc17efab757606e1b1e333fd614dd0
// Originally written by Ard Biesheuvel <[email protected]>

#include "textflag.h"

#define SHA512TRANS(i0, i1, i2, i3, i4, rc0, in0) \
VADD in0.D2, rc0.D2, V5.D2 \
VEXT $8, i3.B16, i2.B16, V6.B16 \
VEXT $8, V5.B16, V5.B16, V5.B16 \
VEXT $8, i2.B16, i1.B16, V7.B16 \
VADD V5.D2, i3.D2, i3.D2 \

#define SHA512ROUND(i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \
VLD1.P 16(R4), [rc1.D2] \
SHA512TRANS(i0, i1, i2, i3, i4, rc0, in0) \
VEXT $8, in4.B16, in3.B16, V5.B16 \
SHA512SU0 in1.D2, in0.D2 \
SHA512H V7.D2, V6, i3 \
SHA512SU1 V5.D2, in2.D2, in0.D2 \
VADD i3.D2, i1.D2, i4.D2 \
SHA512H2 i0.D2, i1, i3

#define SHA512ROUND_NO_UPDATE(i0, i1, i2, i3, i4, rc0, rc1, in0) \
VLD1.P 16(R4), [rc1.D2] \
SHA512TRANS(i0, i1, i2, i3, i4, rc0, in0) \
SHA512H V7.D2, V6, i3 \
VADD i3.D2, i1.D2, i4.D2 \
SHA512H2 i0.D2, i1, i3

#define SHA512ROUND_LAST(i0, i1, i2, i3, i4, rc0, in0) \
SHA512TRANS(i0, i1, i2, i3, i4, rc0, in0) \
SHA512H V7.D2, V6, i3 \
VADD i3.D2, i1.D2, i4.D2 \
SHA512H2 i0.D2, i1, i3

// func blockAsm(dig *digest, p []byte)
TEXT ·blockAsm(SB),NOSPLIT,$0
MOVD dig+0(FP), R0
MOVD p_base+8(FP), R1
MOVD p_len+16(FP), R2
MOVD ·_K+0(SB), R3

// long enough to prefetch
PRFM (R3), PLDL3KEEP
// load digest
VLD1 (R0), [V8.D2, V9.D2, V10.D2, V11.D2]
loop:
// load digest in V0-V3 keeping original in V8-V11
VMOV V8.B16, V0.B16
VMOV V9.B16, V1.B16
VMOV V10.B16, V2.B16
VMOV V11.B16, V3.B16

// load message data in V12-V19
VLD1.P 64(R1), [V12.D2, V13.D2, V14.D2, V15.D2]
VLD1.P 64(R1), [V16.D2, V17.D2, V18.D2, V19.D2]

// convert message into big endian format
VREV64 V12.B16, V12.B16
VREV64 V13.B16, V13.B16
VREV64 V14.B16, V14.B16
VREV64 V15.B16, V15.B16
VREV64 V16.B16, V16.B16
VREV64 V17.B16, V17.B16
VREV64 V18.B16, V18.B16
VREV64 V19.B16, V19.B16

MOVD R3, R4
// load first 4 round consts in V20-V23
VLD1.P 64(R4), [V20.D2, V21.D2, V22.D2, V23.D2]

SHA512ROUND(V0, V1, V2, V3, V4, V20, V24, V12, V13, V19, V16, V17)
SHA512ROUND(V3, V0, V4, V2, V1, V21, V25, V13, V14, V12, V17, V18)
SHA512ROUND(V2, V3, V1, V4, V0, V22, V26, V14, V15, V13, V18, V19)
SHA512ROUND(V4, V2, V0, V1, V3, V23, V27, V15, V16, V14, V19, V12)
SHA512ROUND(V1, V4, V3, V0, V2, V24, V28, V16, V17, V15, V12, V13)

SHA512ROUND(V0, V1, V2, V3, V4, V25, V29, V17, V18, V16, V13, V14)
SHA512ROUND(V3, V0, V4, V2, V1, V26, V30, V18, V19, V17, V14, V15)
SHA512ROUND(V2, V3, V1, V4, V0, V27, V31, V19, V12, V18, V15, V16)
SHA512ROUND(V4, V2, V0, V1, V3, V28, V24, V12, V13, V19, V16, V17)
SHA512ROUND(V1, V4, V3, V0, V2, V29, V25, V13, V14, V12, V17, V18)

SHA512ROUND(V0, V1, V2, V3, V4, V30, V26, V14, V15, V13, V18, V19)
SHA512ROUND(V3, V0, V4, V2, V1, V31, V27, V15, V16, V14, V19, V12)
SHA512ROUND(V2, V3, V1, V4, V0, V24, V28, V16, V17, V15, V12, V13)
SHA512ROUND(V4, V2, V0, V1, V3, V25, V29, V17, V18, V16, V13, V14)
SHA512ROUND(V1, V4, V3, V0, V2, V26, V30, V18, V19, V17, V14, V15)

SHA512ROUND(V0, V1, V2, V3, V4, V27, V31, V19, V12, V18, V15, V16)
SHA512ROUND(V3, V0, V4, V2, V1, V28, V24, V12, V13, V19, V16, V17)
SHA512ROUND(V2, V3, V1, V4, V0, V29, V25, V13, V14, V12, V17, V18)
SHA512ROUND(V4, V2, V0, V1, V3, V30, V26, V14, V15, V13, V18, V19)
SHA512ROUND(V1, V4, V3, V0, V2, V31, V27, V15, V16, V14, V19, V12)

SHA512ROUND(V0, V1, V2, V3, V4, V24, V28, V16, V17, V15, V12, V13)
SHA512ROUND(V3, V0, V4, V2, V1, V25, V29, V17, V18, V16, V13, V14)
SHA512ROUND(V2, V3, V1, V4, V0, V26, V30, V18, V19, V17, V14, V15)
SHA512ROUND(V4, V2, V0, V1, V3, V27, V31, V19, V12, V18, V15, V16)
SHA512ROUND(V1, V4, V3, V0, V2, V28, V24, V12, V13, V19, V16, V17)

SHA512ROUND(V0, V1, V2, V3, V4, V29, V25, V13, V14, V12, V17, V18)
SHA512ROUND(V3, V0, V4, V2, V1, V30, V26, V14, V15, V13, V18, V19)
SHA512ROUND(V2, V3, V1, V4, V0, V31, V27, V15, V16, V14, V19, V12)
SHA512ROUND(V4, V2, V0, V1, V3, V24, V28, V16, V17, V15, V12, V13)
SHA512ROUND(V1, V4, V3, V0, V2, V25, V29, V17, V18, V16, V13, V14)

SHA512ROUND(V0, V1, V2, V3, V4, V26, V30, V18, V19, V17, V14, V15)
SHA512ROUND(V3, V0, V4, V2, V1, V27, V31, V19, V12, V18, V15, V16)

SHA512ROUND_NO_UPDATE(V2, V3, V1, V4, V0, V28, V24, V12)
SHA512ROUND_NO_UPDATE(V4, V2, V0, V1, V3, V29, V25, V13)
SHA512ROUND_NO_UPDATE(V1, V4, V3, V0, V2, V30, V26, V14)
SHA512ROUND_NO_UPDATE(V0, V1, V2, V3, V4, V31, V27, V15)

SHA512ROUND_LAST(V3, V0, V4, V2, V1, V24, V16)
SHA512ROUND_LAST(V2, V3, V1, V4, V0, V25, V17)
SHA512ROUND_LAST(V4, V2, V0, V1, V3, V26, V18)
SHA512ROUND_LAST(V1, V4, V3, V0, V2, V27, V19)

// add result to digest
VADD V0.D2, V8.D2, V8.D2
VADD V1.D2, V9.D2, V9.D2
VADD V2.D2, V10.D2, V10.D2
VADD V3.D2, V11.D2, V11.D2
SUB $128, R2
CBNZ R2, loop

VST1 [V8.D2, V9.D2, V10.D2, V11.D2], (R0)
RET
2 changes: 1 addition & 1 deletion src/crypto/sha512/sha512block_generic.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !amd64 && !s390x && !ppc64le && !ppc64
//go:build !amd64 && !arm64 && !s390x && !ppc64le && !ppc64

package sha512

Expand Down

0 comments on commit 4381c61

Please sign in to comment.