Skip to content

Commit

Permalink
add AES-GCM-SIV amd64 assembler code for AES-CTR and POLYVAL
Browse files Browse the repository at this point in the history
This commit adds AMD64 assembler implementations for AES-CTR
(AES-GCM-SIV) and POLYVAL. The assembler implementations are
still quite generic and do not yet use possible optimizations like
combining decryption and authentication in `Open(...)`.
Such more sophisticated optimizations will be introduced over
time.

The AMD64 assembler code significantly improves performance
on machines with AES-NI and PCLMULQDQ instructions:

```
name               old time/op    new time/op      delta
AES128GCMSeal64-4    5.24µs ± 0%      0.47µs ± 1%    -91.09%  (p=0.029 n=4+4)
AES128GCMSeal1K-4    57.1µs ± 0%       1.3µs ± 0%    -97.71%  (p=0.029 n=4+4)
AES128GCMSeal8K-4     445µs ± 0%         7µs ± 0%    -98.34%  (p=0.029 n=4+4)
AES128GCMOpen64-4    5.27µs ± 0%      0.48µs ± 0%    -90.82%  (p=0.029 n=4+4)
AES128GCMOpen1K-4    57.2µs ± 0%       1.3µs ± 1%    -97.70%  (p=0.029 n=4+4)
AES128GCMOpen8K-4     444µs ± 0%         7µs ± 0%    -98.34%  (p=0.029 n=4+4)
AES256GCMSeal64-4    5.49µs ± 1%      0.57µs ± 0%    -89.66%  (p=0.029 n=4+4)
AES256GCMSeal1K-4    57.9µs ± 0%       1.5µs ± 0%    -97.45%  (p=0.029 n=4+4)
AES256GCMSeal8K-4     449µs ± 0%         8µs ± 0%    -98.18%  (p=0.029 n=4+4)
AES256GCMOpen64-4    5.49µs ± 0%      0.59µs ± 0%    -89.32%  (p=0.029 n=4+4)
AES256GCMOpen1K-4    57.6µs ± 0%       1.5µs ± 0%    -97.40%  (p=0.029 n=4+4)
AES256GCMOpen8K-4     446µs ± 0%         8µs ± 0%    -98.16%  (p=0.029 n=4+4)

name               old speed      new speed        delta
AES128GCMSeal64-4  12.2MB/s ± 0%   137.1MB/s ± 1%  +1021.43%  (p=0.029 n=4+4)
AES128GCMSeal1K-4  17.9MB/s ± 0%   784.8MB/s ± 0%  +4273.17%  (p=0.029 n=4+4)
AES128GCMSeal8K-4  18.4MB/s ± 0%  1106.5MB/s ± 0%  +5911.82%  (p=0.029 n=4+4)
AES128GCMOpen64-4  12.1MB/s ± 0%   132.2MB/s ± 0%   +989.41%  (p=0.029 n=4+4)
AES128GCMOpen1K-4  17.9MB/s ± 0%   776.9MB/s ± 1%  +4241.63%  (p=0.029 n=4+4)
AES128GCMOpen8K-4  18.4MB/s ± 0%  1107.2MB/s ± 0%  +5907.46%  (p=0.029 n=4+4)
AES256GCMSeal64-4  11.7MB/s ± 1%   112.7MB/s ± 0%   +866.88%  (p=0.029 n=4+4)
AES256GCMSeal1K-4  17.7MB/s ± 0%   692.6MB/s ± 0%  +3813.22%  (p=0.029 n=4+4)
AES256GCMSeal8K-4  18.3MB/s ± 0%  1002.0MB/s ± 0%  +5386.18%  (p=0.029 n=4+4)
AES256GCMOpen64-4  11.7MB/s ± 0%   109.1MB/s ± 0%   +835.68%  (p=0.029 n=4+4)
AES256GCMOpen1K-4  17.8MB/s ± 0%   682.5MB/s ± 0%  +3739.66%  (p=0.029 n=4+4)
AES256GCMOpen8K-4  18.4MB/s ± 0%  1000.1MB/s ± 0%  +5347.14%  (p=0.029 n=4+4)
```
  • Loading branch information
Andreas Auernhammer committed Sep 22, 2018
1 parent 339dd21 commit 5ff4065
Show file tree
Hide file tree
Showing 6 changed files with 414 additions and 15 deletions.
10 changes: 5 additions & 5 deletions aes_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -43,17 +43,17 @@ TEXT ·encryptBlock(SB), 4, $0-80
MOVQ keyLen+72(FP), DX

MOVUPS (0 * 16)(SI), X0
CMPQ DX, $24
JE aes_192
JB aes_128
CMPQ DX, $24
JE aes_192
JB aes_128

aes_256:
AES_256(X0, X1, AX)
JMP return
JMP return

aes_192:
AES_192(X0, X1, AX)
JMP return
JMP return

aes_128:
AES_128(X0, X1, AX)
Expand Down
67 changes: 67 additions & 0 deletions aes_gcm_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,78 @@
package siv

import (
"crypto/aes"
"crypto/cipher"
"crypto/subtle"

"golang.org/x/sys/cpu"
)

// polyval is implemented in aes_gcm_amd64.s.
//
// It starts from the 16 bytes currently stored in tag, absorbs
// additionalData and then plaintext in 16-byte blocks (a trailing
// partial block is zero-padded), absorbs a final block holding both
// lengths in bits, and writes the 16-byte result back to tag.
// Only the first 16 bytes of key are read. The tag memory is also
// used as scratch space while a partial block is assembled.
func polyval(tag *[16]byte, additionalData, plaintext, key []byte)

// aesGcmXORKeyStream is implemented in aes_gcm_amd64.s.
//
// It XORs len(src) bytes of src with an AES-CTR key stream and writes
// the result to dst. iv supplies the 16-byte initial counter block and
// keys the expanded round keys. keyLen == 16 selects the AES-128 code
// path; any other value takes the AES-256 path. The assembler performs
// no bounds checks, so dst must hold at least len(src) bytes.
func aesGcmXORKeyStream(dst, src, iv, keys []byte, keyLen uint64)

// newGCM returns the AES-GCM-SIV implementation for key. The assembler
// backed implementation is chosen when the CPU reports both AES and
// PCLMULQDQ support; otherwise the generic implementation is used.
func newGCM(key []byte) aead {
	if !cpu.X86.HasAES || !cpu.X86.HasPCLMULQDQ {
		return newGCMGeneric(key)
	}

	// NOTE(review): the error from aes.NewCipher is discarded, so this
	// presumably relies on the caller having validated the key length
	// already - confirm against the call site.
	block, _ := aes.NewCipher(key)
	return &aesGcmSivAsm{
		keyLen: len(key),
		block:  block,
	}
}

// Compile-time check that *aesGcmSivAsm satisfies the aead interface.
var _ aead = (*aesGcmSivAsm)(nil)

// aesGcmSivAsm is the aead implementation that uses the amd64
// assembler routines polyval and aesGcmXORKeyStream.
type aesGcmSivAsm struct {
// block is the AES cipher created from the master key; it is handed
// to deriveKeys to derive the per-nonce keys.
block cipher.Block
// keyLen is the length of the master key in bytes.
keyLen int
}

// seal encrypts plaintext into ciphertext and appends the 16-byte tag
// directly after the encrypted data. The tag is computed over
// additionalData and plaintext with keys derived from nonce, and it
// also serves (with the top bit of its last byte set) as the initial
// counter block for the encryption.
//
// ciphertext must have room for len(plaintext)+16 bytes.
func (c *aesGcmSivAsm) seal(ciphertext, nonce, plaintext, additionalData []byte) {
	encKey, authKey := deriveKeys(nonce, c.block, c.keyLen)
	keyLen := uint64(len(encKey))

	var roundKeys [240]byte
	keySchedule(roundKeys[:], encKey)

	// Authenticate first: POLYVAL result, masked with the nonce, top bit
	// cleared, then encrypted to form the tag.
	var tag [16]byte
	polyval(&tag, additionalData, plaintext, authKey)
	for i, b := range nonce {
		tag[i] ^= b
	}
	tag[15] &= 0x7f
	encryptBlock(tag[:], tag[:], roundKeys[:], keyLen)

	// The counter block is the tag with the top bit of the last byte set.
	counter := tag
	counter[15] |= 0x80

	aesGcmXORKeyStream(ciphertext, plaintext, counter[:], roundKeys[:], keyLen)
	copy(ciphertext[len(plaintext):], tag[:])
}

// open decrypts ciphertext (encrypted data followed by a 16-byte tag)
// into plaintext and verifies the tag over additionalData and the
// recovered plaintext. On a tag mismatch plaintext is zeroed and
// errOpen is returned.
//
// ciphertext must be at least 16 bytes long.
func (c *aesGcmSivAsm) open(plaintext, nonce, ciphertext, additionalData []byte) error {
	split := len(ciphertext) - 16
	tag := ciphertext[split:]
	ciphertext = ciphertext[:split]

	encKey, authKey := deriveKeys(nonce, c.block, c.keyLen)
	keyLen := uint64(len(encKey))

	var roundKeys [240]byte
	keySchedule(roundKeys[:], encKey)

	// The received tag, with the top bit of its last byte set, is the
	// initial counter block.
	var counter [16]byte
	copy(counter[:], tag)
	counter[15] |= 0x80
	aesGcmXORKeyStream(plaintext, ciphertext, counter[:], roundKeys[:], keyLen)

	// Recompute the tag from the decrypted data.
	var expected [16]byte
	polyval(&expected, additionalData, plaintext, authKey)
	for i, b := range nonce {
		expected[i] ^= b
	}
	expected[15] &= 0x7f
	encryptBlock(expected[:], expected[:], roundKeys[:], keyLen)

	if subtle.ConstantTimeCompare(expected[:], tag) == 1 {
		return nil
	}

	// Do not hand unauthenticated plaintext back to the caller.
	for i := range plaintext {
		plaintext[i] = 0
	}
	return errOpen
}
229 changes: 229 additions & 0 deletions aes_gcm_amd64.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
// Copyright (c) 2018 Andreas Auernhammer. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

// +build amd64,!gccgo,!appengine

#include "textflag.h"
#include "aes_macros_amd64.s"

// ·one is the 128-bit little-endian value 1. aesGcmXORKeyStream adds it
// to the counter block with PADDD to advance the counter by one.
DATA ·one<>+0x00(SB)/8, $1
DATA ·one<>+0x08(SB)/8, $0
GLOBL ·one<>(SB), (NOPTR+RODATA), $16

// ·polyvalMask is the 128-bit constant that polyval loads into X2 and
// passes to the MULTIPLY macro.
// NOTE(review): presumably the reduction constant for the POLYVAL
// field polynomial - confirm against aes_macros_amd64.s.
DATA ·polyvalMask<>+0x00(SB)/8, $0x0000000000000001
DATA ·polyvalMask<>+0x08(SB)/8, $0xc200000000000000
GLOBL ·polyvalMask<>(SB), (NOPTR+RODATA), $16

// func aesGcmXORKeyStream(dst, src, iv, keys []byte, keyLen uint64)
//
// aesGcmXORKeyStream XORs len(src) bytes of src with an AES-CTR key
// stream and writes the result to dst. iv holds the 16-byte initial
// counter block and keys the expanded round keys. keyLen == 16 selects
// the AES-128 path; any other value takes the AES-256 path. The low
// 32-bit word of the counter block is incremented by one per block.
// Neither dst_len, iv_len nor keys_len is checked.
//
// Register use:
//   DI = dst, SI = src, DX = remaining bytes, AX = round keys, CX = keyLen
//   X10 = next counter block, X9 = counter increment (·one)
//
// src is loaded with MOVUPS before it is XORed: a PXOR with a memory
// operand faults on addresses that are not 16-byte aligned, and Go byte
// slices carry no such alignment guarantee. The scratch register used
// for each load (X8, X4, X1) is the temporary already handed to the AES
// macro on that path, so it is dead once the macro has finished.
//
// The flag value 4 is NOSPLIT.
TEXT ·aesGcmXORKeyStream(SB), 4, $0-104
	MOVQ dst+0(FP), DI
	MOVQ src+24(FP), SI
	MOVQ src_len+32(FP), DX
	MOVQ iv+48(FP), BX
	MOVQ keys+72(FP), AX
	MOVQ keyLen+96(FP), CX

	TESTQ DX, DX
	JZ    return

	MOVUPS (0 * 16)(BX), X10
	MOVUPS ·one<>(SB), X9

	// Pick the widest loop the input length allows.
	CMPQ DX, $64
	JB   loop_1
	CMPQ DX, $128
	JB   loop_4

loop_8:
	// Materialize eight consecutive counter blocks in X0..X7.
	MOVAPS X10, X0
	PADDD  X9, X10
	MOVAPS X10, X1
	PADDD  X9, X10
	MOVAPS X10, X2
	PADDD  X9, X10
	MOVAPS X10, X3
	PADDD  X9, X10
	MOVAPS X10, X4
	PADDD  X9, X10
	MOVAPS X10, X5
	PADDD  X9, X10
	MOVAPS X10, X6
	PADDD  X9, X10
	MOVAPS X10, X7
	PADDD  X9, X10

	CMPQ CX, $16
	JE   aes_128_8

aes_256_8:
	AES_256_8(X0, X1, X2, X3, X4, X5, X6, X7, X8, AX)
	JMP xor_8

aes_128_8:
	AES_128_8(X0, X1, X2, X3, X4, X5, X6, X7, X8, AX)

xor_8:
	// All eight source blocks are read before anything is written,
	// exactly as before; only the load is now alignment-safe.
	MOVUPS (0 * 16)(SI), X8
	PXOR   X8, X0
	MOVUPS (1 * 16)(SI), X8
	PXOR   X8, X1
	MOVUPS (2 * 16)(SI), X8
	PXOR   X8, X2
	MOVUPS (3 * 16)(SI), X8
	PXOR   X8, X3
	MOVUPS (4 * 16)(SI), X8
	PXOR   X8, X4
	MOVUPS (5 * 16)(SI), X8
	PXOR   X8, X5
	MOVUPS (6 * 16)(SI), X8
	PXOR   X8, X6
	MOVUPS (7 * 16)(SI), X8
	PXOR   X8, X7
	MOVUPS X0, (0 * 16)(DI)
	MOVUPS X1, (1 * 16)(DI)
	MOVUPS X2, (2 * 16)(DI)
	MOVUPS X3, (3 * 16)(DI)
	MOVUPS X4, (4 * 16)(DI)
	MOVUPS X5, (5 * 16)(DI)
	MOVUPS X6, (6 * 16)(DI)
	MOVUPS X7, (7 * 16)(DI)
	ADDQ   $128, SI
	ADDQ   $128, DI
	SUBQ   $128, DX
	CMPQ   DX, $128
	JAE    loop_8
	TESTQ  DX, DX
	JZ     return
	CMPQ   DX, $64
	JB     loop_1

loop_4:
	// Materialize four consecutive counter blocks in X0..X3.
	MOVAPS X10, X0
	PADDD  X9, X10
	MOVAPS X10, X1
	PADDD  X9, X10
	MOVAPS X10, X2
	PADDD  X9, X10
	MOVAPS X10, X3
	PADDD  X9, X10

	CMPQ CX, $16
	JE   aes_128_4

aes_256_4:
	AES_256_4(X0, X1, X2, X3, X4, AX)
	JMP xor_4

aes_128_4:
	AES_128_4(X0, X1, X2, X3, X4, AX)

xor_4:
	MOVUPS (0 * 16)(SI), X4
	PXOR   X4, X0
	MOVUPS (1 * 16)(SI), X4
	PXOR   X4, X1
	MOVUPS (2 * 16)(SI), X4
	PXOR   X4, X2
	MOVUPS (3 * 16)(SI), X4
	PXOR   X4, X3
	MOVUPS X0, (0 * 16)(DI)
	MOVUPS X1, (1 * 16)(DI)
	MOVUPS X2, (2 * 16)(DI)
	MOVUPS X3, (3 * 16)(DI)
	ADDQ   $64, SI
	ADDQ   $64, DI
	SUBQ   $64, DX
	CMPQ   DX, $64
	JAE    loop_4
	TESTQ  DX, DX
	JZ     return

loop_1:
	// DX is non-zero whenever this label is reached.
	MOVAPS X10, X0
	PADDD  X9, X10
	CMPQ   CX, $16
	JE     aes_128_1

aes_256_1:
	AES_256(X0, X1, AX)
	JMP xor_1

aes_128_1:
	AES_128(X0, X1, AX)

xor_1:
	// X0 now holds one block of key stream.
	CMPQ   DX, $16
	JB     finalize
	MOVUPS 0(SI), X1
	PXOR   X1, X0
	MOVUPS X0, 0(DI)

	ADDQ $16, SI
	ADDQ $16, DI
	SUBQ $16, DX
	JNZ  loop_1 // only produce another key stream block if data remains
	JMP  return

finalize:
	TESTQ DX, DX
	JZ    return

finalize_loop:
	// Consume the key stream in X0 one byte at a time for the trailing
	// partial block: take the low byte, then shift X0 right by one byte.
	MOVQ   X0, R10
	PSRLDQ $1, X0
	MOVB   0(SI), R11
	XORQ   R11, R10
	MOVB   R10, 0(DI)
	INCQ   SI
	INCQ   DI
	DECQ   DX
	JNZ    finalize_loop

return:
	RET

// func polyval(tag *[16]byte, additionalData, plaintext, key []byte)
//
// polyval starts from the 16 bytes stored in tag, absorbs additionalData
// and then plaintext in 16-byte blocks (a trailing partial block is
// zero-padded), absorbs a final block that holds both lengths in bits,
// and stores the 16-byte result in tag. Only the first 16 bytes of key
// are read. The memory behind tag doubles as scratch space while a
// partial block and the length block are assembled.
//
// The argument frame is 80 bytes: one pointer (8) plus three slice
// headers (3 * 24).
//
// Register use:
//   X0 = accumulator, X1 = key, X2 = ·polyvalMask, X7 = load scratch
//   SI/DX = pointer/remaining length of the input currently absorbed
//   BX/CX = plaintext pointer/length, kept for the second pass
//   R14/R15 = additionalData/plaintext length in bits
//   AX = number of passes left (2: additionalData, then plaintext)
//
// NOTE(review): MULTIPLY comes from aes_macros_amd64.s; it presumably
// multiplies X0 by X1 in the POLYVAL field using X2 for the reduction
// and X3..X6 as temporaries - confirm there.
TEXT ·polyval(SB), $0-80
	MOVQ tag+0(FP), DI
	MOVQ additionalData+8(FP), SI
	MOVQ additionalData_len+16(FP), DX
	MOVQ plaintext+32(FP), BX
	MOVQ plaintext_len+40(FP), CX
	MOVQ key+56(FP), AX

	// Byte lengths -> bit lengths for the final length block.
	MOVQ  DX, R14
	MOVQ  CX, R15
	SHLQ  $3, R14
	SHLQ  $3, R15
	MOVOU 0(DI), X0
	MOVOU 0(AX), X1
	MOVOU ·polyvalMask<>(SB), X2

	// AX is free once the key is loaded; reuse it as the pass counter.
	MOVQ $2, AX

loop:
	CMPQ   DX, $16
	JB     finalize
	MOVUPS 0(SI), X7
	PXOR   X7, X0
	MULTIPLY(X0, X1, X2, X3, X4, X5, X6)
	ADDQ   $16, SI
	SUBQ   $16, DX
	JMP    loop

finalize:
	TESTQ DX, DX
	JZ    process_next

	// Build the zero-padded partial block inside the tag memory.
	MOVQ  DI, R11
	PXOR  X3, X3
	MOVOU X3, 0(R11)

finalize_loop:
	MOVB 0(SI), R10
	MOVB R10, 0(R11)
	INCQ SI
	INCQ R11
	DECQ DX
	JNZ  finalize_loop

	// Load through MOVOU: PXOR with a memory operand needs a 16-byte
	// aligned address, which a *[16]byte does not guarantee.
	MOVOU 0(DI), X7
	PXOR  X7, X0
	MULTIPLY(X0, X1, X2, X3, X4, X5, X6)

process_next:
	// Switch to the plaintext for the second pass; after that pass AX
	// reaches zero and control falls through to the length block.
	MOVQ BX, SI
	MOVQ CX, DX
	DECQ AX
	JNZ  loop

	// Length block: additionalData bits in the low half, plaintext bits
	// in the high half.
	MOVQ  R14, 0(DI)
	MOVQ  R15, 8(DI)
	MOVOU 0(DI), X7
	PXOR  X7, X0
	MULTIPLY(X0, X1, X2, X3, X4, X5, X6)
	MOVOU X0, 0(DI)
	RET
20 changes: 10 additions & 10 deletions aes_gcm_generic.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ type aesGcmSivGeneric struct {
}

func (c *aesGcmSivGeneric) seal(ciphertext, nonce, plaintext, additionalData []byte) {
encKey, authKey := c.deriveKeys(nonce)
encKey, authKey := deriveKeys(nonce, c.block, c.keyLen)

var tag [16]byte
polyvalGeneric(&tag, additionalData, plaintext, authKey)
Expand All @@ -46,7 +46,7 @@ func (c *aesGcmSivGeneric) open(plaintext, nonce, ciphertext, additionalData []b
tag := ciphertext[len(ciphertext)-16:]
ciphertext = ciphertext[:len(ciphertext)-16]

encKey, authKey := c.deriveKeys(nonce)
encKey, authKey := deriveKeys(nonce, c.block, c.keyLen)
var ctrBlock [16]byte
copy(ctrBlock[:], tag)
ctrBlock[15] |= 0x80
Expand All @@ -70,39 +70,39 @@ func (c *aesGcmSivGeneric) open(plaintext, nonce, ciphertext, additionalData []b
return nil
}

func (c *aesGcmSivGeneric) deriveKeys(nonce []byte) (encKey, authKey []byte) {
func deriveKeys(nonce []byte, block cipher.Block, keyLen int) (encKey, authKey []byte) {
var counter [16]byte
encKey = make([]byte, 32)
authKey = make([]byte, 16)
copy(counter[4:], nonce[:])

var tmp [16]byte
binary.LittleEndian.PutUint32(counter[:4], 0)
c.block.Encrypt(tmp[:], counter[:])
block.Encrypt(tmp[:], counter[:])
copy(authKey[0:], tmp[:8])

binary.LittleEndian.PutUint32(counter[:4], 1)
c.block.Encrypt(tmp[:], counter[:])
block.Encrypt(tmp[:], counter[:])
copy(authKey[8:], tmp[:8])

binary.LittleEndian.PutUint32(counter[:4], 2)
c.block.Encrypt(tmp[:], counter[:])
block.Encrypt(tmp[:], counter[:])
copy(encKey[0:], tmp[:8])

binary.LittleEndian.PutUint32(counter[:4], 3)
c.block.Encrypt(tmp[:], counter[:])
block.Encrypt(tmp[:], counter[:])
copy(encKey[8:], tmp[:8])

if c.keyLen == 16 {
if keyLen == 16 {
return encKey[:16], authKey
}

binary.LittleEndian.PutUint32(counter[:4], 4)
c.block.Encrypt(tmp[:], counter[:])
block.Encrypt(tmp[:], counter[:])
copy(encKey[16:], tmp[:8])

binary.LittleEndian.PutUint32(counter[:4], 5)
c.block.Encrypt(tmp[:], counter[:])
block.Encrypt(tmp[:], counter[:])
copy(encKey[24:], tmp[:8])

return encKey, authKey
Expand Down
Loading

0 comments on commit 5ff4065

Please sign in to comment.