-
Notifications
You must be signed in to change notification settings - Fork 71
Efficient Software Implementations of ZUC
Sun Yimin edited this page Nov 26, 2024
·
11 revisions
- Keystream generator
- SIMD + AES-NI
- EIA
- Carryless multiplication, 无进位乘法, Use instruction PCLMULQDQ (AMD64)
- Multi-Buffer, 多路并行
from pyfinite import ffield
# Field polynomial x^8 + x^7 + x^3 + x + 1, passed to FField as the generator of GF(2^8).
gen = 0b110001011
F = ffield.FField(8, gen, useLUT=0) # useLUT=0 must be given here, otherwise things go wrong...
# Eight byte constants; zuc_sbox_gen XORs A[i] into the result when bit (7 - i) of the inverse is set.
A = [0b01110111, 0b10111011, 0b11011101, 0b11101110, 0b11001011, 0b01101101, 0b00111110, 0b10010111]
def zuc_sbox_gen(x):
    """Return S(x) for the input byte x.

    The field inverse of x is taken in F; for every set bit of that
    inverse (most significant bit first) the matching entry of A is XORed
    into the result, and the constant 0x55 is folded in as well.
    """
    inverse = F.Inverse(x)
    acc = 0x55  # affine constant, applied up front instead of at the end
    for index, column in enumerate(A):
        # If bit (7 - index) of the inverse is 1, XOR in the matching column.
        if (inverse >> (7 - index)) & 1:
            acc ^= column
    return acc
def print_table(table):
    """Print the entries of table as upper-case hex bytes, 16 per line.

    Each entry is written as ``0xNN ,`` with no separator other than the
    trailing comma; a newline is emitted after every 16th entry.

    Args:
        table: iterable of integers to print.
    """
    for count, value in enumerate(table, start=1):
        # A real f-string replaces the placeholder-less f'...' % value form.
        print(f'0x{value:02X} ,', end='')
        if count % 16 == 0:
            print()
# Entry 0 is fixed to 0x55 and never goes through zuc_sbox_gen;
# the remaining 255 entries are generated one by one.
sbox = [0x55]
for i in range(1, 256):
    sbox.append(zuc_sbox_gen(i))  # generate the S-box entry
print_table(sbox)
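参考 AES 和 SM4 的 S 盒复合域实现方法的做法:
假设复合域求逆运算为 $f$,则 $S_{aes}(x) = A_{aes} X_{aes} f(X_{aes}^{-1} x) + C_{aes}$,$S_{zuc}(x) = A_{zuc} X_{zuc} f(X_{zuc}^{-1} x) + C_{zuc}$
得到 $S_{zuc}(x) = M_2 \cdot S_{aes}(M_1 x + C_1) + C_2$,其中 $M_1 = X_{aes} X_{zuc}^{-1}$,$C_1 = 0$,$M_2 = A_{zuc} X_{zuc} X_{aes}^{-1} A_{aes}^{-1}$,$C_2 = M_2 C_{aes} + C_{zuc}$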
只有这种S盒和AES盒构造完全同构的,用AESENCLAST时,使用0x63作为enc_key才有点意义。
from pyfinite import ffield
from pyfinite import genericmatrix
def XOR(x, y):
    """Return x XOR y; used as both addition and subtraction for GF(2) matrices."""
    return x ^ y


def AND(x, y):
    """Return x AND y; used as multiplication for GF(2) matrices."""
    return x & y


def DIV(x, y):
    """Return x unchanged; y is ignored (division placeholder for GF(2) matrices)."""
    return x
def aes_f():
    """Build the GF(2^8) field defined by x^8 + x^4 + x^3 + x + 1 (0b100011011).

    The field is created with useLUT=0.
    """
    return ffield.FField(8, 0b100011011, useLUT=0)
def zuc_f():
    """Build the GF(2^8) field defined by x^8 + x^7 + x^3 + x + 1 (0b110001011).

    The field is created with useLUT=0.
    """
    return ffield.FField(8, 0b110001011, useLUT=0)
# Module-level field instances shared by the generator functions below.
aesf = aes_f()
zucf = zuc_f()
def field_pow2(x, F):
    """Return x squared, computed with F.Multiply."""
    squared = F.Multiply(x, x)
    return squared
def field_pow3(x, F):
    """Return x cubed in F, as x multiplied by its square."""
    squared = field_pow2(x, F)
    return F.Multiply(x, squared)
def field_pow4(x, F):
    """Return x to the 4th power in F by squaring twice."""
    squared = field_pow2(x, F)
    return field_pow2(squared, F)
def field_pow16(x, F):
    """Return x to the 16th power in F by applying the 4th power twice."""
    fourth = field_pow4(x, F)
    return field_pow4(fourth, F)
def get_all_WZY(F):
    """Enumerate every (W, Z, Y) combination found by exhaustive search in F.

    W ranges over the solutions of w^2 + w + 1 = 0, Z over the solutions
    of z^2 + z + W^2 = 0, and Y over the solutions of y^2 + y + u = 0
    where u = (W^2)^2 * Z.  All searches scan 0..255 in ascending order.

    Returns:
        A list of [W, W^2, Z, Z^4, Y, Y^16] lists, in search order.
    """
    def roots(constant):
        # All r in 0..255 satisfying r^2 + r + constant == 0 in F.
        return [r for r in range(256) if field_pow2(r, F) ^ r ^ constant == 0]

    found = []
    for w in roots(1):
        w_sq = field_pow2(w, F)  # also plays the role of N
        for z in roots(w_sq):
            z_4 = field_pow4(z, F)
            u = F.Multiply(field_pow2(w_sq, F), z)
            for y in roots(u):
                found.append([w, w_sq, z, z_4, y, field_pow16(y, F)])
    return found
def gen_X(F, W, W_2, Z, Z_4, Y, Y_16):
    """Return the eight triple products (w * z) * y computed with F.Multiply.

    The products are listed with y in (Y_16, Y) as the slowest-changing
    factor, z in (Z_4, Z) next, and w in (W_2, W) changing fastest:
    [W_2*Z_4*Y_16, W*Z_4*Y_16, W_2*Z*Y_16, W*Z*Y_16,
     W_2*Z_4*Y,    W*Z_4*Y,    W_2*Z*Y,    W*Z*Y].
    """
    products = []
    for y_factor in (Y_16, Y):
        for z_factor in (Z_4, Z):
            for w_factor in (W_2, W):
                wz = F.Multiply(w_factor, z_factor)
                products.append(F.Multiply(wz, y_factor))
    return products
def to_matrix(x):
    """Build an 8x8 GF(2) matrix whose column k holds the bits of x[k].

    Row 0 receives bit 7 (the most significant bit) of every byte and
    row 7 receives bit 0.
    """
    m = genericmatrix.GenericMatrix(size=(8,8), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for row in range(8):
        shift = 7 - row
        m.SetRow(row, [(x[col] >> shift) & 1 for col in range(8)])
    return m
def matrix_col_byte(c):
    """Pack the eight entries c[0]..c[7] into one byte, c[0] as the top bit."""
    packed = 0
    for position in range(8):
        packed ^= c[position] << (7 - position)
    return packed
def matrix_row_byte(c):
    """Pack the eight entries c[0]..c[7] into one byte, c[0] as the top bit."""
    packed = 0
    for position in range(8):
        packed ^= c[position] << (7 - position)
    return packed
def matrix_cols(m):
    """Return columns 0..7 of m, each packed into a byte by matrix_col_byte."""
    return [matrix_col_byte(m.GetColumn(index)) for index in range(8)]
def matrix_rows(m):
    """Return rows 0..7 of m, each packed into a byte by matrix_row_byte."""
    return [matrix_row_byte(m.GetRow(index)) for index in range(8)]
def gen_X_inv(x):
    """Return the inverse of the matrix built from x, as eight column bytes.

    x is turned into a matrix with to_matrix, inverted, and the inverse is
    read back column by column with matrix_cols.
    """
    inverse = to_matrix(x).Inverse()
    return matrix_cols(inverse)
def G4_mul(x, y):
    """
    GF(2^2) multiplication, normal basis {W^2, W}.

    Only the two low bits of each operand are read.
    """
    x_hi, x_lo = (x >> 1) & 1, x & 1
    y_hi, y_lo = (y >> 1) & 1, y & 1
    # Shared cross term, added to both output coordinates.
    cross = (x_hi ^ x_lo) & (y_hi ^ y_lo)
    hi = (x_hi & y_hi) ^ cross
    lo = (x_lo & y_lo) ^ cross
    return (hi << 1) | lo
def G4_mul_N(x):
    """
    GF(2^2) multiplication by N, normal basis {W^2, W}, N = W^2.

    Only the two low bits of x are read.
    """
    hi = (x >> 1) & 1
    lo = x & 1
    return (lo << 1) | (hi ^ lo)
def G4_mul_N2(x):
    """
    GF(2^2) multiplication by N^2, normal basis {W^2, W}, N = W^2.

    Only the two low bits of x are read.
    """
    hi = (x >> 1) & 1
    lo = x & 1
    return ((hi ^ lo) << 1) | hi
def G4_inv(x):
    """
    GF(2^2) inverse operator: swaps the two low bits of x.
    """
    return ((x & 1) << 1) | ((x >> 1) & 1)
def G16_mul(x, y):
    """
    GF(2^4) multiplication, normal basis {Z^4, Z}.

    Each operand is split into two 2-bit halves that are combined with
    the GF(2^2) helpers; only the four low bits of each operand are read.
    """
    x_hi, x_lo = (x >> 2) & 0x3, x & 0x3
    y_hi, y_lo = (y >> 2) & 0x3, y & 0x3
    # Cross term: product of the half-sums, scaled by N.
    cross = G4_mul_N(G4_mul(x_hi ^ x_lo, y_hi ^ y_lo))
    hi = G4_mul(x_hi, y_hi) ^ cross
    lo = G4_mul(x_lo, y_lo) ^ cross
    return (hi << 2) | lo
def G16_sq_mul_u(x):
    """
    GF(2^4) operator x^2 * u, with u = N^2 Z and N = W^2.

    Built from G4_inv and G4_mul_N2 applied to the two 2-bit halves of x.
    """
    hi = (x >> 2) & 0x3
    lo = x & 0x3
    new_hi = G4_inv(hi ^ lo)
    new_lo = G4_mul_N2(G4_inv(lo))
    return (new_hi << 2) | new_lo
def G16_inv(x):
    """
    GF(2^4) inverse operator.

    The two 2-bit halves of x are reduced to a single GF(2^2) value, that
    value is inverted, and the result is multiplied back into the swapped
    halves.
    """
    hi = (x >> 2) & 0x3
    lo = x & 0x3
    reduced = G4_mul_N(G4_inv(hi ^ lo)) ^ G4_mul(hi, lo)
    reduced_inv = G4_inv(reduced)
    # Note the swap: the new high half comes from the old low half.
    return (G4_mul(reduced_inv, lo) << 2) | G4_mul(reduced_inv, hi)
def G256_inv(x):
    """
    GF(2^8) inverse operator.

    The two nibbles of x are reduced to a single GF(2^4) value, that
    value is inverted with G16_inv, and the result is multiplied back into
    the swapped nibbles.
    """
    hi = (x >> 4) & 0xf
    lo = x & 0x0f
    reduced = G16_sq_mul_u(hi ^ lo) ^ G16_mul(hi, lo)
    reduced_inv = G16_inv(reduced)
    # Note the swap: the new high nibble comes from the old low nibble.
    return (G16_mul(reduced_inv, lo) << 4) | G16_mul(reduced_inv, hi)
def G256_new_basis(x, b):
    """
    Representation of x under the new basis b.

    For every set bit of x, taken from bit 7 down to bit 0, the matching
    entry b[0]..b[7] is XORed into the result.
    """
    result = 0
    for index, shift in enumerate(range(7, -1, -1)):
        if (x >> shift) & 1:
            result ^= b[index]
    return result
# Eight byte constants applied as the final G256_new_basis step in AES_SBOX;
# gen_all_m1_c1_m2_c2 also loads them into a matrix via to_matrix.
AES_A = [0b10001111, 0b11000111, 0b11100011, 0b11110001, 0b11111000, 0b01111100, 0b00111110, 0b00011111]
# Bits of 0x63, most significant bit first (the constant XORed in by AES_SBOX).
AES_C = [0, 1, 1, 0, 0, 0, 1, 1]
def AES_SBOX(X, X_inv):
    """Build a 256-entry table through the composite-field inversion.

    Each input byte is re-expressed with X_inv, inverted with G256_inv,
    re-expressed with X, mapped through AES_A, and XORed with 0x63.

    Args:
        X: eight basis bytes for G256_new_basis, applied after the inversion.
        X_inv: eight basis bytes for G256_new_basis, applied before it.
    """
    def entry(value):
        inverted = G256_inv(G256_new_basis(value, X_inv))
        restored = G256_new_basis(inverted, X)
        return G256_new_basis(restored, AES_A) ^ 0x63

    return [entry(value) for value in range(256)]
def print_sbox(sbox):
    """Print the entries of sbox as lower-case hex bytes, 16 per line.

    Each entry is written as ``nn ,`` (no ``0x`` prefix); a newline is
    emitted after every 16th entry.

    Args:
        sbox: iterable of integers to print.
    """
    for count, value in enumerate(sbox, start=1):
        # A real f-string replaces the placeholder-less f'...' % value form.
        print(f'{value:02x} ,', end='')
        if count % 16 == 0:
            print()
def print_all_aes_sbox():
    """Print the table produced by AES_SBOX for every result of get_all_WZY(aesf).

    Each table is followed by a blank line.
    """
    # Iterate the results directly; the enumerate index was never used.
    for v in get_all_WZY(aesf):
        X = gen_X(aesf, v[0], v[1], v[2], v[3], v[4], v[5])
        X_inv = gen_X_inv(X)
        print_sbox(AES_SBOX(X, X_inv))
        print()
# Eight byte constants applied as the final G256_new_basis step in ZUC_SBOX;
# gen_all_m1_c1_m2_c2 also loads them into a matrix via to_matrix.
ZUC_A = [0b01110111, 0b10111011, 0b11011101, 0b11101110, 0b11001011, 0b01101101, 0b00111110, 0b10010111]
# Bits of 0x55, most significant bit first (the constant XORed in by ZUC_SBOX).
ZUC_C = [0, 1, 0, 1, 0, 1, 0, 1]
def ZUC_SBOX(X, X_inv):
    """Build a 256-entry table through the composite-field inversion.

    Each input byte is re-expressed with X_inv, inverted with G256_inv,
    re-expressed with X, mapped through ZUC_A, and XORed with 0x55.

    Args:
        X: eight basis bytes for G256_new_basis, applied after the inversion.
        X_inv: eight basis bytes for G256_new_basis, applied before it.
    """
    def entry(value):
        inverted = G256_inv(G256_new_basis(value, X_inv))
        restored = G256_new_basis(inverted, X)
        return G256_new_basis(restored, ZUC_A) ^ 0x55

    return [entry(value) for value in range(256)]
def print_all_zuc_sbox():
    """Print the table produced by ZUC_SBOX for every result of get_all_WZY(zucf).

    Each table is followed by a blank line.
    """
    # Iterate the results directly; the enumerate index was never used.
    for v in get_all_WZY(zucf):
        X = gen_X(zucf, v[0], v[1], v[2], v[3], v[4], v[5])
        X_inv = gen_X_inv(X)
        print_sbox(ZUC_SBOX(X, X_inv))
        print()
def print_m(m):
    """Print every entry of m as ``0xnn ,`` (lower-case hex) on a single line.

    No newline is written, so the caller can continue the same line.

    Args:
        m: iterable of integers to print.
    """
    # The enumerate index was unused and the f-string had no placeholders.
    for value in m:
        print(f'0x{value:02x} ,', end='')
def gen_all_m1_c1_m2_c2():
    """Print M1, C1, M2 and C2 for every pairing of AES and ZUC results.

    For each result of get_all_WZY(aesf) paired with each result of
    get_all_WZY(zucf), with X the matrix built from gen_X and A the matrix
    built from AES_A / ZUC_A:

        M1 = Xaes * Xzuc^-1
        C1 = 0
        M2 = Azuc * Xzuc * Xaes^-1 * Aaes^-1
        C2 = M2 * Caes  XOR  (ZUC_C packed into a byte, i.e. 0x55)

    M1 and M2 are printed as row bytes; each pairing is followed by a
    blank line.
    """
    Aaes_inv = to_matrix(AES_A).Inverse()
    Azuc = to_matrix(ZUC_A)

    # AES constant as an 8x1 column vector.
    Caes = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for row in range(8):
        Caes.SetRow(row, [AES_C[row]])
    # ZUC constant as a byte; this replaces a column vector that was built
    # but never used while the literal 0x55 was XORed in directly.
    zuc_const = matrix_col_byte(ZUC_C)

    # The ZUC-side matrices do not depend on the AES result, so build them
    # once instead of once per AES iteration.
    zuc_side = []
    for v2 in get_all_WZY(zucf):
        Xzuc = to_matrix(gen_X(zucf, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5]))
        zuc_side.append((Xzuc.Inverse(), Azuc * Xzuc))

    for v1 in get_all_WZY(aesf):
        Xaes = to_matrix(gen_X(aesf, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]))
        Xaes_inv = Xaes.Inverse()
        for Xzuc_inv, Azuc_Xzuc in zuc_side:
            M1 = Xaes * Xzuc_inv
            M2 = Azuc_Xzuc * Xaes_inv * Aaes_inv
            C2 = M2 * Caes
            print('M1= ', end='')
            print_m(matrix_rows(M1))
            print(' C1= ', end='')
            print(hex(0x0))
            print('M2= ', end='')
            print_m(matrix_rows(M2))
            print(' C2= ', end='')
            print(hex(zuc_const ^ matrix_col_byte(C2.GetColumn(0))))
            print()
# Run the search and print every (M1, C1, M2, C2) combination.
gen_all_m1_c1_m2_c2()
结果:
M1= 0x28 ,0x58 ,0xf6 ,0x76 ,0x8a ,0x40 ,0x3e ,0xf3 , C1= 0x0
M2= 0x81 ,0xfd ,0x57 ,0x8e ,0xdb ,0x6d ,0xf6 ,0x2e , C2= 0xab
M1= 0x3c ,0xaa ,0xe2 ,0x90 ,0xb2 ,0x78 ,0x3e ,0x2b , C1= 0x0
M2= 0x0e ,0x43 ,0x91 ,0x08 ,0xa3 ,0x93 ,0x70 ,0x6e , C2= 0xbc
M1= 0xc6 ,0xac ,0x18 ,0x9e ,0x5a ,0x4e ,0x12 ,0x95 , C1= 0x0
M2= 0x01 ,0x5d ,0x26 ,0x88 ,0xcc ,0xb3 ,0x36 ,0x96 , C2= 0xd8
M1= 0x0c ,0x5e ,0xd2 ,0xa6 ,0xbc ,0xa8 ,0x12 ,0xbf , C1= 0x0
M2= 0x87 ,0x25 ,0xe0 ,0x07 ,0x72 ,0x82 ,0xb9 ,0xdf , C2= 0x58
M1= 0x70 ,0x7c ,0xae ,0x1e ,0xf0 ,0xc8 ,0x06 ,0xdd , C1= 0x0
M2= 0x02 ,0xa5 ,0xd8 ,0x5a ,0x05 ,0xd9 ,0xed ,0x0d , C2= 0xfe
//Intel也用了这组
M1= 0x96 ,0x50 ,0x48 ,0xd4 ,0xe4 ,0xdc ,0x06 ,0x11 , C1= 0x0
M2= 0x3a ,0xd4 ,0x1e ,0xad ,0xb2 ,0x99 ,0x1a ,0x3c , C2= 0x32
M1= 0x52 ,0x6e ,0x8c ,0x02 ,0x26 ,0xc0 ,0xf4 ,0x47 , C1= 0x0
M2= 0x95 ,0x45 ,0x66 ,0xf5 ,0x9d ,0xe7 ,0x84 ,0x15 , C2= 0xec
M1= 0x6a ,0x42 ,0xb4 ,0x16 ,0xec ,0x0a ,0xf4 ,0xa7 , C1= 0x0
M2= 0x62 ,0xf2 ,0xa0 ,0xcd ,0xec ,0xae ,0xbc ,0xeb , C2= 0xb7
from pyfinite import genericmatrix
def XOR(x, y):
    """Return x XOR y; used as both addition and subtraction for GF(2) matrices."""
    return x ^ y
def AND(x, y):
    """Return x AND y; used as multiplication for GF(2) matrices."""
    return x & y
def DIV(x, y):
    """Return x unchanged; y is ignored (division placeholder for GF(2) matrices)."""
    return x
def genCMatrix(c):
    """Return an 8x1 GF(2) column vector holding the bits of the byte c.

    Row 0 receives bit 7 (the most significant bit) and row 7 receives bit 0.

    Args:
        c: byte value to expand. The argument used to be ignored in favour
            of a hard-coded 0x63; calling with 0x63 gives the same vector
            as before.
    """
    column = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for row in range(8):
        column.SetRow(row, [(c >> (7 - row)) & 1])
    return column
def matrix_from_cols(cols):
    """Build an 8x8 GF(2) matrix whose column k holds the bits of cols[k].

    Row 0 receives bit 7 (the most significant bit) of every byte and
    row 7 receives bit 0.
    """
    m = genericmatrix.GenericMatrix(size=(8, 8), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for row in range(8):
        shift = 7 - row
        m.SetRow(row, [(cols[index] >> shift) & 1 for index in range(8)])
    return m
def gen_matrix_based_table(table):
    """Derive a matrix from a lookup table.

    The columns are table[0x80], table[0x40], ..., table[0x01], each XORed
    with table[0], and are handed to matrix_from_cols in that order.
    """
    columns = [table[1 << bit] ^ table[0] for bit in range(7, -1, -1)]
    return matrix_from_cols(columns)
def gen_matrix_based_high_low(high, low):
    """Derive a matrix from a pair of 16-entry nibble tables.

    The tables are first expanded into a 256-entry table whose entry
    16*i + j is high[i] ^ low[j]; that table is then passed to
    gen_matrix_based_table.
    """
    combined = [high[i] ^ low[j] for i in range(16) for j in range(16)]
    return gen_matrix_based_table(combined)
def matrix_col_byte(c):
    """Pack the eight entries c[0]..c[7] into one byte, c[0] as the top bit."""
    packed = 0
    for position in range(8):
        packed ^= c[position] << (7 - position)
    return packed
def gen_lookup(m, c):
    """Return the 256-entry table of (m * i) XOR c for i in 0..255.

    Each i is expanded into an 8x1 column vector (bit 7 in row 0),
    multiplied by m, packed back into a byte and XORed with c.
    """
    def column_of(value):
        vec = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
        for row in range(8):
            vec.SetRow(row, [(value >> (7 - row)) & 1])
        return vec

    lookup = []
    for value in range(256):
        product = m * column_of(value)
        lookup.append(matrix_col_byte(product.GetColumn(0)) ^ c)
    return lookup
def gen_lookup_low(m, c):
    """Return the 256-entry table of (m * low_nibble(i)) XOR c for i in 0..255.

    Rows 0..3 of the input vector (bits 7..4 of i) are forced to zero, so
    only the low nibble of i contributes to each entry.
    """
    def column_of(value):
        vec = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
        for row in range(8):
            vec.SetRow(row, [(value >> (7 - row)) & 1])
        return vec

    lookup = []
    for value in range(256):
        # Masking with 0x0F zeroes the four high-bit rows.
        product = m * column_of(value & 0x0F)
        lookup.append(matrix_col_byte(product.GetColumn(0)) ^ c)
    return lookup
def gen_lookup_high(m, c):
    """Return the 256-entry table of (m * high_nibble(i)) XOR c for i in 0..255.

    Rows 4..7 of the input vector (bits 3..0 of i) are forced to zero, so
    only the high nibble of i contributes to each entry.
    """
    def column_of(value):
        vec = genericmatrix.GenericMatrix(size=(8, 1), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
        for row in range(8):
            vec.SetRow(row, [(value >> (7 - row)) & 1])
        return vec

    lookup = []
    for value in range(256):
        # Masking with 0xF0 zeroes the four low-bit rows.
        product = m * column_of(value & 0xF0)
        lookup.append(matrix_col_byte(product.GetColumn(0)) ^ c)
    return lookup
def print_table(table):
    """Print the entries of table as upper-case hex bytes, 16 per line.

    Each entry is written as ``0xNN ,``; a newline is emitted after every
    16th entry.

    Args:
        table: iterable of integers to print.
    """
    for count, value in enumerate(table, start=1):
        # A real f-string replaces the placeholder-less f'...' % value form.
        print(f'0x{value:02X} ,', end='')
        if count % 16 == 0:
            print()
def print_high(table):
    """Print every 16th entry of table (indices 0, 16, 32, ...) on one line.

    Entries are written as ``0xNN ,`` in upper-case hex and the line is
    terminated with a newline, even when table is empty.

    Args:
        table: iterable of integers; for a 256-entry table the printed
            entries are the 16 values at the start of each group of 16.
    """
    for index, value in enumerate(table):
        if index % 16 == 0:
            # A real f-string replaces the placeholder-less f'...' % value form.
            print(f'0x{value:02X} ,', end='')
    print()
def print_low(table):
    """Print the first 16 entries of table on one line.

    Entries are written as ``0xNN ,`` in upper-case hex and the line is
    terminated with a newline, even when table is empty.

    Args:
        table: iterable of integers; entries from index 16 on are skipped.
    """
    for index, value in enumerate(table):
        if index < 16:
            # A real f-string replaces the placeholder-less f'...' % value form.
            print(f'0x{value:02X} ,', end='')
    print()
def to_matrix(x):
    """Build an 8x8 GF(2) matrix whose row i holds the bits of x[i].

    Within each row, column 0 receives bit 7 (the most significant bit)
    and column 7 receives bit 0.
    """
    m = genericmatrix.GenericMatrix(size=(8,8), zeroElement=0, identityElement=1, add=XOR, mul=AND, sub=XOR, div=DIV)
    for row in range(8):
        m.SetRow(row, [(x[row] >> shift) & 1 for shift in range(7, -1, -1)])
    return m
def gen_intel_c(m, c):
    """Return (m * 0x63) XOR c as a byte.

    The column vector comes from genCMatrix(0x63); it is multiplied by m
    and packed with matrix_col_byte before c is XORed in.
    """
    product = m * genCMatrix(0x63)
    return matrix_col_byte(product.GetColumn(0)) ^ c
# Row bytes of the M2 matrix from the result set noted above as also used by Intel (its C2 is 0x32).
Mmatrix = to_matrix([0x3a ,0xd4 ,0x1e ,0xad ,0xb2 ,0x99 ,0x1a ,0x3c])
print('High')
# High-nibble table with the constant 0x55 folded in.
print_high(gen_lookup_high(Mmatrix, 0x55))
print()
print('Low for AMD64 wich use Cancel AES 0x63')
# Low-nibble table with no constant; XORed with the high table the total constant is 0x55.
print_low(gen_lookup_low(Mmatrix, 0x00))
print()
print('Low for ARM64')
# The high table already carries 0x55, so 0x32 ^ 0x55 here makes the combined constant 0x32.
print_low(gen_lookup_low(Mmatrix, 0x32^0x55))
结果:
High
0x55 ,0xBA ,0xCC ,0x23 ,0x15 ,0xFA ,0x8C ,0x63 ,0x09 ,0xE6 ,0x90 ,0x7F ,0x49 ,0xA6 ,0xD0 ,0x3F ,
Low for AMD64 wich use Cancel AES 0x63
0x00 ,0x14 ,0xAA ,0xBE ,0x71 ,0x65 ,0xDB ,0xCF ,0xB7 ,0xA3 ,0x1D ,0x09 ,0xC6 ,0xD2 ,0x6C ,0x78 ,
Low for ARM64
0x67 ,0x73 ,0xCD ,0xD9 ,0x16 ,0x02 ,0xBC ,0xA8 ,0xD0 ,0xC4 ,0x7A ,0x6E ,0xA1 ,0xB5 ,0x0B ,0x1F ,
当然,ARM64的外层查找表也可以写成:
# Same M2 matrix as before; here the whole constant 0x32 is placed in the low table.
Mmatrix = to_matrix([0x3a ,0xd4 ,0x1e ,0xad ,0xb2 ,0x99 ,0x1a ,0x3c])
print('High')
# High-nibble table without any constant.
print_high(gen_lookup_high(Mmatrix, 0x00))
print()
print('Low for ARM64')
# Low-nibble table carrying the full constant 0x32.
print_low(gen_lookup_low(Mmatrix, 0x32))
结果:
High
0x00 ,0xEF ,0x99 ,0x76 ,0x40 ,0xAF ,0xD9 ,0x36 ,0x5C ,0xB3 ,0xC5 ,0x2A ,0x1C ,0xF3 ,0x85 ,0x6A ,
Low for ARM64
0x32 ,0x26 ,0x98 ,0x8C ,0x43 ,0x57 ,0xE9 ,0xFD ,0x85 ,0x91 ,0x2F ,0x3B ,0xF4 ,0xE0 ,0x5E ,0x4A ,
内层查找表:
# Row bytes of the M1 matrix from the same result set (its C1 is 0x0), so both tables use constant 0.
Mmatrix = to_matrix([0x96 ,0x50 ,0x48 ,0xd4 ,0xe4 ,0xdc ,0x06 ,0x11])
print('High')
print_high(gen_lookup_high(Mmatrix, 0x00))
print()
print('Low')
print_low(gen_lookup_low(Mmatrix, 0x00))
结果:
High
0x00 ,0xD5 ,0x08 ,0xDD ,0x7C ,0xA9 ,0x74 ,0xA1 ,0x9C ,0x49 ,0x94 ,0x41 ,0xE0 ,0x35 ,0xE8 ,0x3D ,
Low
0x00 ,0x01 ,0x82 ,0x83 ,0x9E ,0x9F ,0x1C ,0x1D ,0x24 ,0x25 ,0xA6 ,0xA7 ,0xBA ,0xBB ,0x38 ,0x39 ,
经过测试,SHLD/SHRD的性能还不如目前的多条指令实现。
BYTE $0x41; BYTE $0x0F; BYTE $0xA4; BYTE $0xC3; BYTE $0x10 \ // SHLDL(BRC_X0, AX, $16)
BYTE $0x41; BYTE $0x0F; BYTE $0xA4; BYTE $0xDC; BYTE $0x10 \ // SHLDL(BRC_X1, BX, $16)
BYTE $0x41; BYTE $0x0F; BYTE $0xA4; BYTE $0xCD; BYTE $0x10 \ // SHLDL(BRC_X2, CX, $16)
BYTE $0x41; BYTE $0x0F; BYTE $0xA4; BYTE $0xD6; BYTE $0x10
- zuc sbox with aesni, This is the pure golang code to study ZUC implementation with AESENCLAST/AESE instruction.
- Faster 128-EEA3 and 128-EIA3 Software, Delayed modular reduction & Carryless multiplication
- Efficient Software Implementations of ZUC-256, 这篇文章有ZUC S0 和 S1 的较详细介绍。
- Analyzing SNOW and ZUC Security Algorithms Using NIST SP 800-22 and Enhancing their Randomness
- Intel(R) Multi-Buffer Crypto for IPsec Library,在Intel CPU架构实现所有优化。
- Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode