Skip to content

Efficient Software Implementations of ZUC

Sun Yimin edited this page Nov 26, 2024 · 11 revisions

主要涉及:

  1. Keystream generator
  1. SIMD + AES-NI
  1. EIA
  1. Carryless multiplication, 无进位乘法, Use instruction PCLMULQDQ (AMD64)
  1. Multi-Buffer, 多路并行

S1 Sbox生成

改编自AES 和 SM4 的 S 盒生成方法简介

from pyfinite import ffield

# Generator (reduction) polynomial handed to FField: x^8 + x^7 + x^3 + x + 1.
gen = 0b110001011
F = ffield.FField(8, gen, useLUT=0) # useLUT=0 must be given here, otherwise things go wrong...

# Affine-matrix columns as bytes: A[i] is XORed into the result when bit (7 - i) of the inverse is set.
A = [0b01110111, 0b10111011, 0b11011101, 0b11101110, 0b11001011, 0b01101101, 0b00111110, 0b10010111]

def zuc_sbox_gen(x):
    '''
    Return S(x), the S-box output for the input byte x.

    The value is the field inverse of x (in F) pushed through the affine
    map whose columns are stored in A, then XORed with the constant 0x55.

    x = 0 has no multiplicative inverse; it is mapped with the usual
    S-box convention 0^-1 = 0, so only the constant 0x55 remains.
    '''
    if x == 0:
        # Handled here so callers do not have to special-case zero.
        return 0x55
    x_inv = F.Inverse(x)
    y = 0
    for i, a in enumerate(A):
        if x_inv & (1 << (7 - i)):
            y ^= a  # bit is set -> XOR in the matching matrix column
    return y ^ 0x55

def print_table(table):
    '''
    Print every entry as an upper-case hex byte followed by " ,",
    starting a new output line after each group of 16 entries.
    '''
    for count, value in enumerate(table, start=1):
        print('0x%02X' % value, ',', end='')
        if count % 16 == 0:
            print()

# Entry 0 is fixed to 0x55; every other entry comes from zuc_sbox_gen.
sbox = [0x55 if value == 0 else zuc_sbox_gen(value) for value in range(256)]

print_table(sbox)

从AES S盒计算ZUC S1

参考aes和sm4s盒复合域实现方法的做法:
$S_{zuc}(x)=L(S_{aes}(Mx))+C$,下面我们尝试进行推导 L, M, C
假设复合域求逆运算为 $f$,则:
$S_{aes}(x)=A_{aes}X_{aes}f(X^{-1}_{aes}x) + 0x63$

$S_{zuc}(x)=A_{zuc}X_{zuc}f(X^{-1}_{zuc}x) + 0x55$

得到
$L=A_{zuc}X_{zuc}X^{-1}_{aes}A^{-1}_{aes}$

$M=X_{aes}X^{-1}_{zuc}$

$C=L\ 0x63+0x55$

只有这种S盒和AES盒构造完全同构的,用AESENCLAST时,使用0x63作为enc_key才有点意义。

from pyfinite import ffield
from pyfinite import genericmatrix

def XOR(x, y):
    '''Bitwise XOR; passed to GenericMatrix as both add and sub.'''
    return x ^ y

def AND(x, y):
    '''Bitwise AND; passed to GenericMatrix as mul.'''
    return x & y

def DIV(x, y):
    '''Division callback for GenericMatrix: returns x unchanged and ignores y.'''
    return x

def aes_f():
    '''
    Build the GF(2^8) field with generator x^8 + x^4 + x^3 + x + 1
    (0b100011011), with lookup tables disabled.
    '''
    return ffield.FField(8, 0b100011011, useLUT=0)

def zuc_f():
    '''
    Build the GF(2^8) field with generator x^8 + x^7 + x^3 + x + 1
    (0b110001011), with lookup tables disabled.
    '''
    return ffield.FField(8, 0b110001011, useLUT=0)

# Module-level field instances: aesf for the AES polynomial, zucf for the ZUC one.
aesf = aes_f()
zucf = zuc_f()

def field_pow2(x, F):
    '''
    Return x squared, using the multiplication of field F.
    '''
    squared = F.Multiply(x, x)
    return squared

def field_pow3(x, F):
    '''
    Return x cubed in field F, computed as x * x^2.
    '''
    squared = field_pow2(x, F)
    return F.Multiply(x, squared)

def field_pow4(x, F):
    '''
    Return x^4 in field F by squaring twice.
    '''
    squared = field_pow2(x, F)
    return field_pow2(squared, F)

def field_pow16(x, F):
    '''
    Return x^16 in field F by applying the fourth power twice.
    '''
    fourth = field_pow4(x, F)
    return field_pow4(fourth, F)

def get_all_WZY(F):
    '''
    Search field F for every [W, W^2, Z, Z^4, Y, Y^16] combination.

    All 256 byte values are tried at each level:
      * W satisfies W^2 + W + 1 = 0
      * Z satisfies Z^2 + Z + N = 0, with N = W^2
      * Y satisfies Y^2 + Y + u = 0, with u = N^2 * Z
    Results are returned in search order (W outermost, Y innermost).
    '''
    solutions = []
    for W in range(256):
        if field_pow2(W, F) ^ W ^ 1 != 0:
            continue
        W_2 = field_pow2(W, F)
        N = W_2
        for Z in range(256):
            if field_pow2(Z, F) ^ Z ^ N != 0:
                continue
            Z_4 = field_pow4(Z, F)
            u = F.Multiply(field_pow2(N, F), Z)
            for Y in range(256):
                if field_pow2(Y, F) ^ Y ^ u == 0:
                    solutions.append([W, W_2, Z, Z_4, Y, field_pow16(Y, F)])
    return solutions

def gen_X(F, W, W_2, Z, Z_4, Y, Y_16):
    '''
    Return the eight products w * z * y taken in field F.

    y runs over (Y_16, Y) slowest, z over (Z_4, Z), and w over (W_2, W)
    fastest, so the list starts with W_2*Z_4*Y_16 and ends with W*Z*Y.
    '''
    return [
        F.Multiply(F.Multiply(w, z), y)
        for y in (Y_16, Y)
        for z in (Z_4, Z)
        for w in (W_2, W)
    ]

def to_matrix(x):
    '''
    Build an 8x8 GF(2) matrix whose column c holds the bits of byte x[c].

    Row 0 takes the most significant bit (0x80) of each of the eight
    bytes, row 7 the least significant bit.
    '''
    m = genericmatrix.GenericMatrix(size=(8,8), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for row in range(8):
        shift = 7 - row
        m.SetRow(row, [(x[col] >> shift) & 1 for col in range(8)])
    return m

def matrix_col_byte(c):
    '''
    Pack the eight bits of a matrix column into one byte; c[0] becomes
    the most significant bit.
    '''
    packed = 0
    for pos in range(8):
        packed ^= c[pos] << (7 - pos)
    return packed

def matrix_row_byte(c):
    '''
    Pack the eight bits of a matrix row into one byte; c[0] becomes the
    most significant bit.
    '''
    packed = 0
    for pos in range(8):
        # Shift what has been collected so far and append the next bit.
        packed = (packed << 1) ^ c[pos]
    return packed

def matrix_cols(m):
    '''
    Return the eight columns of m, each packed into a byte.
    '''
    return [matrix_col_byte(m.GetColumn(index)) for index in range(8)]

def matrix_rows(m):
    '''
    Return the eight rows of m, each packed into a byte.
    '''
    return [matrix_row_byte(m.GetRow(index)) for index in range(8)]

def gen_X_inv(x):
    '''
    Invert the matrix whose columns are the bytes of x and return the
    columns of the inverse as bytes.
    '''
    inverse = to_matrix(x).Inverse()
    return matrix_cols(inverse)

def G4_mul(x, y):
    '''
    GF(2^2) multiplication; operands and result use the normal basis
    {W^2, W}. Only the two low bits of x and y are read.
    '''
    x_hi, x_lo = (x >> 1) & 1, x & 1
    y_hi, y_lo = (y >> 1) & 1, y & 1
    # Cross term shared by both output bits.
    cross = (x_hi ^ x_lo) & (y_hi ^ y_lo)
    hi = (x_hi & y_hi) ^ cross
    lo = (x_lo & y_lo) ^ cross
    return (hi << 1) | lo

def G4_mul_N(x):
    '''
    GF(2^2) multiplication by N = W^2 in the normal basis {W^2, W}.
    '''
    hi = (x >> 1) & 1
    lo = x & 1
    return (lo << 1) | (hi ^ lo)

def G4_mul_N2(x):
    '''
    GF(2^2) multiplication by N^2 (N = W^2) in the normal basis {W^2, W}.
    '''
    hi = (x >> 1) & 1
    lo = x & 1
    new_hi = hi ^ lo
    return (new_hi << 1) | hi

def G4_inv(x):
    '''
    GF(2^2) inverse operator: swaps the two bits of x.
    '''
    hi = (x >> 1) & 1
    lo = x & 1
    return hi | (lo << 1)

def G16_mul(x, y):
    '''
    GF(2^4) multiplication in the normal basis {Z^4, Z}.

    Each operand is split into two GF(2^2) halves (bits 3..2 and 1..0).
    '''
    x_hi, x_lo = (x >> 2) & 0x03, x & 0x03
    y_hi, y_lo = (y >> 2) & 0x03, y & 0x03
    # Shared cross term: (x_hi + x_lo) * (y_hi + y_lo), scaled by N.
    cross = G4_mul_N(G4_mul(x_hi ^ x_lo, y_hi ^ y_lo))
    hi = G4_mul(x_hi, y_hi) ^ cross
    lo = G4_mul(x_lo, y_lo) ^ cross
    return (hi << 2) | lo

def G16_sq_mul_u(x):
    '''
    GF(2^4) operator x -> x^2 * u, where u = N^2 Z and N = W^2.
    '''
    hi = (x >> 2) & 0x03
    lo = x & 0x03
    out_hi = G4_inv(hi ^ lo)
    out_lo = G4_mul_N2(G4_inv(lo))
    return (out_hi << 2) | out_lo

def G16_inv(x):
    '''
    GF(2^4) inverse operator, built from GF(2^2) operations on the two
    2-bit halves of x.
    '''
    hi = (x >> 2) & 0x03
    lo = x & 0x03
    scaled = G4_mul_N(G4_inv(hi ^ lo))
    product = G4_mul(hi, lo)
    factor = G4_inv(scaled ^ product)
    # The halves cross over: the low half feeds the high output half.
    out_hi = G4_mul(factor, lo)
    out_lo = G4_mul(factor, hi)
    return (out_hi << 2) | out_lo

def G256_inv(x):
    '''
    GF(2^8) inverse operator, built from GF(2^4) operations on the two
    nibbles of x.
    '''
    hi = (x >> 4) & 0x0f
    lo = x & 0x0f
    scaled = G16_sq_mul_u(hi ^ lo)
    product = G16_mul(hi, lo)
    factor = G16_inv(scaled ^ product)
    # The nibbles cross over: the low nibble feeds the high output nibble.
    out_hi = G16_mul(factor, lo)
    out_lo = G16_mul(factor, hi)
    return (out_hi << 4) | out_lo

def G256_new_basis(x, b):
    '''
    Re-express byte x under the basis b.

    b[i] is XORed into the result for every set bit (7 - i) of x, so
    b[0] pairs with the most significant bit.
    '''
    result = 0
    for pos in range(8):
        mask = 0x80 >> pos
        if x & mask:
            result ^= b[pos]
    return result

# AES affine matrix stored as eight bytes; AES_A[i] is XORed in for bit (7 - i) of the input in G256_new_basis.
AES_A = [0b10001111, 0b11000111, 0b11100011, 0b11110001, 0b11111000, 0b01111100, 0b00111110, 0b00011111]
# Bits of the AES affine constant 0x63, most significant bit first.
AES_C = [0, 1, 1, 0, 0, 0, 1, 1]

def AES_SBOX(X, X_inv):
    '''
    Build the 256-entry AES S-box via composite-field inversion.

    Each input is mapped with X_inv, inverted with G256_inv, mapped back
    with X, passed through AES_A, and XORed with 0x63.
    '''
    def _substitute(value):
        mapped = G256_new_basis(value, X_inv)
        inverted = G256_inv(mapped)
        restored = G256_new_basis(inverted, X)
        return G256_new_basis(restored, AES_A) ^ 0x63

    return [_substitute(value) for value in range(256)]

def print_sbox(sbox):
    '''
    Print every entry as a two-digit lower-case hex value followed by
    " ,", starting a new output line after each group of 16 entries.
    '''
    for count, value in enumerate(sbox, start=1):
        print('%02x' % value, ',', end='')
        if count % 16 == 0:
            print()

def print_all_aes_sbox():
    '''
    Print the AES S-box generated from every combination returned by
    get_all_WZY(aesf), with a blank line after each table.
    '''
    for params in get_all_WZY(aesf):
        basis = gen_X(aesf, *params)
        basis_inv = gen_X_inv(basis)
        print_sbox(AES_SBOX(basis, basis_inv))
        print()

# ZUC S1 affine matrix stored as eight bytes; ZUC_A[i] is XORed in for bit (7 - i) of the input in G256_new_basis.
ZUC_A = [0b01110111, 0b10111011, 0b11011101, 0b11101110, 0b11001011, 0b01101101, 0b00111110, 0b10010111]
# Bits of the ZUC S1 affine constant 0x55, most significant bit first.
ZUC_C = [0, 1, 0, 1, 0, 1, 0, 1]

def ZUC_SBOX(X, X_inv):
    '''
    Build the 256-entry ZUC S1 S-box via composite-field inversion.

    Each input is mapped with X_inv, inverted with G256_inv, mapped back
    with X, passed through ZUC_A, and XORed with 0x55.
    '''
    def _substitute(value):
        mapped = G256_new_basis(value, X_inv)
        inverted = G256_inv(mapped)
        restored = G256_new_basis(inverted, X)
        return G256_new_basis(restored, ZUC_A) ^ 0x55

    return [_substitute(value) for value in range(256)]

def print_all_zuc_sbox():
    '''
    Print the ZUC S-box generated from every combination returned by
    get_all_WZY(zucf), with a blank line after each table.
    '''
    for params in get_all_WZY(zucf):
        basis = gen_X(zucf, *params)
        basis_inv = gen_X_inv(basis)
        print_sbox(ZUC_SBOX(basis, basis_inv))
        print()

def print_m(m):
    '''
    Print every byte of m as lower-case hex followed by " ,", all on one
    line and without a trailing newline.
    '''
    for value in m:
        print('0x%02x' % value, ',', end='')

def gen_all_m1_c1_m2_c2():
    '''
    Print M1, C1, M2 and C2 for every AES/ZUC composite-field basis pair.

    For each AES basis X_aes and each ZUC basis X_zuc found by
    get_all_WZY:
        M1 = X_aes * X_zuc^-1
        M2 = A_zuc * X_zuc * X_aes^-1 * A_aes^-1
        C1 = 0
        C2 = M2 * 0x63 ^ 0x55
    Matrices are printed as eight row bytes each.
    '''
    Aaes_inv = to_matrix(AES_A).Inverse()
    Azuc = to_matrix(ZUC_A)

    # AES affine constant (0x63) as an 8x1 column vector.
    Caes = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for i in range(8):
        Caes.SetRow(i, [AES_C[i]])

    # The ZUC basis matrices do not depend on the AES basis, so build
    # every (X_zuc, X_zuc^-1) pair once instead of once per AES basis.
    zuc_bases = []
    for v2 in get_all_WZY(zucf):
        Xzuc = to_matrix(gen_X(zucf, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5]))
        zuc_bases.append((Xzuc, Xzuc.Inverse()))

    for v1 in get_all_WZY(aesf):
        Xaes = to_matrix(gen_X(aesf, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]))
        Xaes_inv = Xaes.Inverse()
        for Xzuc, Xzuc_inv in zuc_bases:
            M1 = Xaes * Xzuc_inv
            M2 = Azuc * Xzuc * Xaes_inv * Aaes_inv
            C2 = M2 * Caes
            print('M1=', '', end='')
            print_m(matrix_rows(M1))
            print(' C1=', '', end='')
            print(hex(0x0))
            print('M2=', '', end='')
            print_m(matrix_rows(M2))
            print(' C2=', '', end='')
            # Fold the AES constant through M2, then add the ZUC constant.
            print(hex(0x55 ^ matrix_col_byte(C2.GetColumn(0))))
            print()

# Script entry point: print every (M1, C1, M2, C2) candidate.
gen_all_m1_c1_m2_c2()

结果:

M1= 0x28 ,0x58 ,0xf6 ,0x76 ,0x8a ,0x40 ,0x3e ,0xf3 , C1= 0x0
M2= 0x81 ,0xfd ,0x57 ,0x8e ,0xdb ,0x6d ,0xf6 ,0x2e , C2= 0xab

M1= 0x3c ,0xaa ,0xe2 ,0x90 ,0xb2 ,0x78 ,0x3e ,0x2b , C1= 0x0
M2= 0x0e ,0x43 ,0x91 ,0x08 ,0xa3 ,0x93 ,0x70 ,0x6e , C2= 0xbc

M1= 0xc6 ,0xac ,0x18 ,0x9e ,0x5a ,0x4e ,0x12 ,0x95 , C1= 0x0
M2= 0x01 ,0x5d ,0x26 ,0x88 ,0xcc ,0xb3 ,0x36 ,0x96 , C2= 0xd8

M1= 0x0c ,0x5e ,0xd2 ,0xa6 ,0xbc ,0xa8 ,0x12 ,0xbf , C1= 0x0
M2= 0x87 ,0x25 ,0xe0 ,0x07 ,0x72 ,0x82 ,0xb9 ,0xdf , C2= 0x58

M1= 0x70 ,0x7c ,0xae ,0x1e ,0xf0 ,0xc8 ,0x06 ,0xdd , C1= 0x0
M2= 0x02 ,0xa5 ,0xd8 ,0x5a ,0x05 ,0xd9 ,0xed ,0x0d , C2= 0xfe

//Intel也用了这组
M1= 0x96 ,0x50 ,0x48 ,0xd4 ,0xe4 ,0xdc ,0x06 ,0x11 , C1= 0x0
M2= 0x3a ,0xd4 ,0x1e ,0xad ,0xb2 ,0x99 ,0x1a ,0x3c , C2= 0x32

M1= 0x52 ,0x6e ,0x8c ,0x02 ,0x26 ,0xc0 ,0xf4 ,0x47 , C1= 0x0
M2= 0x95 ,0x45 ,0x66 ,0xf5 ,0x9d ,0xe7 ,0x84 ,0x15 , C2= 0xec

M1= 0x6a ,0x42 ,0xb4 ,0x16 ,0xec ,0x0a ,0xf4 ,0xa7 , C1= 0x0
M2= 0x62 ,0xf2 ,0xa0 ,0xcd ,0xec ,0xae ,0xbc ,0xeb , C2= 0xb7

计算查找表

from pyfinite import genericmatrix

def XOR(x, y):
    '''Bitwise XOR; passed to GenericMatrix as both add and sub.'''
    return x ^ y

def AND(x, y):
    '''Bitwise AND; passed to GenericMatrix as mul.'''
    return x & y

def DIV(x, y):
    '''Division callback for GenericMatrix: returns x unchanged and ignores y.'''
    return x

def genCMatrix(c):
    '''
    Build the 8x1 GF(2) column vector holding the bits of byte c.

    Row 0 receives the most significant bit of c, row 7 the least
    significant one.
    '''
    Imatrix = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for j in range(8):
        # Use the argument: the constant 0x63 was hard-coded here before,
        # so every other value of c was silently ignored.
        Imatrix.SetRow(j, [(c >> (7 - j)) & 1])
    return Imatrix

def matrix_from_cols(cols):
    '''
    Build an 8x8 GF(2) matrix whose column n holds the bits of cols[n],
    most significant bit in row 0.
    '''
    m = genericmatrix.GenericMatrix(size=(8, 8), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for row in range(8):
        shift = 7 - row
        m.SetRow(row, [(cols[col] >> shift) & 1 for col in range(8)])
    return m

def gen_matrix_based_table(table):
    '''
    Recover the matrix behind a 256-entry lookup table.

    Column n is table[0x80 >> n] ^ table[0]: the entry for a single set
    input bit, with the entry for input 0 XORed out.
    '''
    columns = [table[0x80 >> bit] ^ table[0] for bit in range(8)]
    return matrix_from_cols(columns)

def gen_matrix_based_high_low(high, low):
    '''
    Recover the matrix from a pair of 16-entry nibble tables.

    The full table is rebuilt as high[i] ^ low[j] with index i * 16 + j
    and handed to gen_matrix_based_table.
    '''
    combined = [high[i] ^ low[j] for i in range(16) for j in range(16)]
    return gen_matrix_based_table(combined)

def matrix_col_byte(c):
    '''
    Pack the eight bits of a matrix column into one byte; c[0] becomes
    the most significant bit.
    '''
    packed = 0
    for pos in range(8):
        packed ^= c[pos] << (7 - pos)
    return packed

def gen_lookup(m, c):
    '''
    Build the 256-entry table i -> (m * i) ^ c, where i is expanded to an
    8x1 bit column with its most significant bit in row 0.
    '''
    def _column(value):
        vec = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
        for row in range(8):
            vec.SetRow(row, [(value >> (7 - row)) & 1])
        return vec

    return [matrix_col_byte((m * _column(value)).GetColumn(0)) ^ c for value in range(256)]

def gen_lookup_low(m, c):
    '''
    Build the 256-entry table i -> (m * low_nibble(i)) ^ c.

    The four high bits of i are forced to zero before multiplying, so
    only the low nibble contributes.
    '''
    entries = []
    for value in range(256):
        low_only = value & 0x0F
        vec = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
        for row in range(8):
            vec.SetRow(row, [(low_only >> (7 - row)) & 1])
        product = m * vec
        entries.append(matrix_col_byte(product.GetColumn(0)) ^ c)
    return entries

def gen_lookup_high(m, c):
    '''
    Build the 256-entry table i -> (m * high_nibble(i)) ^ c.

    The four low bits of i are forced to zero before multiplying, so
    only the high nibble contributes.
    '''
    entries = []
    for value in range(256):
        high_only = value & 0xF0
        vec = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
        for row in range(8):
            vec.SetRow(row, [(high_only >> (7 - row)) & 1])
        product = m * vec
        entries.append(matrix_col_byte(product.GetColumn(0)) ^ c)
    return entries

def print_table(table):
    '''
    Print every entry as an upper-case hex byte followed by " ,",
    starting a new output line after each group of 16 entries.
    '''
    for count, value in enumerate(table, start=1):
        print('0x%02X' % value, ',', end='')
        if count % 16 == 0:
            print()

def print_high(table):
    '''
    Print every 16th entry (indices 0, 16, 32, ...) as upper-case hex on
    one line, then end the line.
    '''
    for index, value in enumerate(table):
        if index % 16 != 0:
            continue
        print('0x%02X' % value, ',', end='')
    print()

def print_low(table):
    '''
    Print the first 16 entries as upper-case hex on one line, then end
    the line.
    '''
    for index, value in enumerate(table):
        if index >= 16:
            continue
        print('0x%02X' % value, ',', end='')
    print()

def to_matrix(x):
    '''
    Build an 8x8 GF(2) matrix whose row i holds the bits of byte x[i],
    most significant bit in column 0.
    '''
    m = genericmatrix.GenericMatrix(size=(8,8), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for row in range(8):
        value = x[row]
        m.SetRow(row, [(value >> (7 - col)) & 1 for col in range(8)])
    return m

def gen_intel_c(m, c):
    '''
    Return (m * 0x63) ^ c: the constant 0x63 multiplied by matrix m,
    then XORed with c.
    '''
    aes_const = genCMatrix(0x63)
    product = m * aes_const
    return c ^ matrix_col_byte(product.GetColumn(0))

# Outer matrix, given as row bytes (the M2 row listed with C2 = 0x32 in the results above).
Mmatrix = to_matrix([0x3a ,0xd4 ,0x1e ,0xad ,0xb2 ,0x99 ,0x1a ,0x3c])
print('High')
# The high-nibble table carries the constant 0x55.
print_high(gen_lookup_high(Mmatrix, 0x55))
print()
print('Low for AMD64 wich use Cancel AES 0x63')
# No extra constant in the low-nibble table for this variant.
print_low(gen_lookup_low(Mmatrix, 0x00))
print()
print('Low for ARM64')
# 0x32 ^ 0x55: combined with the 0x55 in the high table, the total constant is 0x32.
print_low(gen_lookup_low(Mmatrix, 0x32^0x55))

结果:

High
0x55 ,0xBA ,0xCC ,0x23 ,0x15 ,0xFA ,0x8C ,0x63 ,0x09 ,0xE6 ,0x90 ,0x7F ,0x49 ,0xA6 ,0xD0 ,0x3F ,

Low for AMD64 wich use Cancel AES 0x63
0x00 ,0x14 ,0xAA ,0xBE ,0x71 ,0x65 ,0xDB ,0xCF ,0xB7 ,0xA3 ,0x1D ,0x09 ,0xC6 ,0xD2 ,0x6C ,0x78 ,

Low for ARM64
0x67 ,0x73 ,0xCD ,0xD9 ,0x16 ,0x02 ,0xBC ,0xA8 ,0xD0 ,0xC4 ,0x7A ,0x6E ,0xA1 ,0xB5 ,0x0B ,0x1F ,

当然,ARM64的外层查找表也可以写成:

# Same outer matrix; here the whole constant 0x32 sits in the low-nibble table.
Mmatrix = to_matrix([0x3a ,0xd4 ,0x1e ,0xad ,0xb2 ,0x99 ,0x1a ,0x3c])
print('High')
# High-nibble table without any constant.
print_high(gen_lookup_high(Mmatrix, 0x00))
print()
print('Low for ARM64')
print_low(gen_lookup_low(Mmatrix, 0x32))

结果:

High
0x00 ,0xEF ,0x99 ,0x76 ,0x40 ,0xAF ,0xD9 ,0x36 ,0x5C ,0xB3 ,0xC5 ,0x2A ,0x1C ,0xF3 ,0x85 ,0x6A ,

Low for ARM64
0x32 ,0x26 ,0x98 ,0x8C ,0x43 ,0x57 ,0xE9 ,0xFD ,0x85 ,0x91 ,0x2F ,0x3B ,0xF4 ,0xE0 ,0x5E ,0x4A ,

内层查找表:

# Inner matrix, given as row bytes (the M1 row of the same result set, C1 = 0x0).
Mmatrix = to_matrix([0x96 ,0x50 ,0x48 ,0xd4 ,0xe4 ,0xdc ,0x06 ,0x11])
print('High')
# Neither table carries a constant.
print_high(gen_lookup_high(Mmatrix, 0x00))
print()
print('Low')
print_low(gen_lookup_low(Mmatrix, 0x00))

结果:

High
0x00 ,0xD5 ,0x08 ,0xDD ,0x7C ,0xA9 ,0x74 ,0xA1 ,0x9C ,0x49 ,0x94 ,0x41 ,0xE0 ,0x35 ,0xE8 ,0x3D ,

Low
0x00 ,0x01 ,0x82 ,0x83 ,0x9E ,0x9F ,0x1C ,0x1D ,0x24 ,0x25 ,0xA6 ,0xA7 ,0xBA ,0xBB ,0x38 ,0x39 ,

AMD64 SHLD SHRD的性能

经过测试,SHLD/SHRD的性能还不如目前的多条指令实现。

	BYTE $0x41; BYTE $0x0F; BYTE $0xA4; BYTE $0xC3; BYTE $0x10 \ // SHLDL(BRC_X0, AX, $16)
	BYTE $0x41; BYTE $0x0F; BYTE $0xA4; BYTE $0xDC; BYTE $0x10 \ // SHLDL(BRC_X1, BX, $16)
	BYTE $0x41; BYTE $0x0F; BYTE $0xA4; BYTE $0xCD; BYTE $0x10 \ // SHLDL(BRC_X2, CX, $16)
	BYTE $0x41; BYTE $0x0F; BYTE $0xA4; BYTE $0xD6; BYTE $0x10

参考:

  1. zuc sbox with aesni, This is the pure golang code to study ZUC implementation with AESENCLAST/AESE instruction.
  2. Faster 128-EEA3 and 128-EIA3 Software, Delayed modular reduction & Carryless multiplication
  3. Efficient Software Implementations of ZUC-256, 这篇文章有ZUC S0 和 S1 的较详细介绍。
  4. Analyzing SNOW and ZUC Security Algorithms Using NIST SP 800-22 and Enhancing their Randomness
  5. Intel(R) Multi-Buffer Crypto for IPsec Library,在Intel CPU架构实现所有优化。
  6. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode