diff --git a/include/gmssl/sm4_cl.h b/include/gmssl/sm4_cl.h index 1394e2ef3..c3a5fe641 100644 --- a/include/gmssl/sm4_cl.h +++ b/include/gmssl/sm4_cl.h @@ -28,7 +28,7 @@ extern "C" { typedef struct { uint32_t rk[32]; - size_t workgroup_size; + //size_t workgroup_size; cl_context context; cl_command_queue queue; cl_program program; @@ -40,7 +40,7 @@ typedef struct { int sm4_cl_set_encrypt_key(SM4_CL_CTX *ctx, const uint8_t key[16]); int sm4_cl_set_decrypt_key(SM4_CL_CTX *ctx, const uint8_t key[16]); -int sm4_cl_encrypt(SM4_CL_CTX *ctx, const uint8_t *in, size_t nblocks, uint8_t *out); +int sm4_cl_ctr32_encrypt(SM4_CL_CTX *ctx, uint8_t iv[16], const uint8_t *in, size_t nblocks, uint8_t *out); void sm4_cl_cleanup(SM4_CL_CTX *ctx); diff --git a/src/sm4_cl.c b/src/sm4_cl.c index 02ed565e9..64dbfdcf3 100644 --- a/src/sm4_cl.c +++ b/src/sm4_cl.c @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -197,15 +198,18 @@ static int sm4_cl_set_key(SM4_CL_CTX *ctx, const uint8_t key[16], int enc) free(log); goto end; } - if (!(ctx->kernel = clCreateKernel(ctx->program, "sm4_encrypt", &err))) { + if (!(ctx->kernel = clCreateKernel(ctx->program, "sm4_ctr32_encrypt", &err))) { cl_error_print(err); goto end; } + /* + // Apple M2 the CL_KERNEL_WORK_GROUP_SIZE is 256, but the valid work_group_size is 32 if ((err = clGetKernelWorkGroupInfo(ctx->kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(ctx->workgroup_size), &ctx->workgroup_size, NULL)) != CL_SUCCESS) { cl_error_print(err); goto end; } + */ if (enc) { sm4_set_encrypt_key((SM4_KEY *)ctx->rk, key); @@ -239,51 +243,71 @@ int sm4_cl_set_decrypt_key(SM4_CL_CTX *ctx, const uint8_t key[16]) return sm4_cl_set_key(ctx, key, 0); } -int sm4_cl_encrypt(SM4_CL_CTX *ctx, const uint8_t *in, size_t nblocks, uint8_t *out) +int sm4_cl_ctr32_encrypt(SM4_CL_CTX *ctx, uint8_t iv[16], const uint8_t *in, size_t nblocks, uint8_t *out) { int ret = -1; - cl_mem mem; cl_int err; - size_t len = 16 * nblocks; + uint32_t ctr[4]; + cl_mem mem_ctr = NULL; + size_t inlen = SM4_BLOCK_SIZE * nblocks; + cl_mem mem_buf = NULL; cl_uint dim = 1; size_t global_work_size = nblocks; - size_t local_work_size = 32; //ctx->workgroup_size; - void *p; + size_t local_work_size = 32; - if (out != in) - memcpy(out, in, len); + if (global_work_size % local_work_size) { + error_print(); + return -1; + } + + ctr[0] = GETU32(iv); + ctr[1] = GETU32(iv + 4); + ctr[2] = GETU32(iv + 8); + ctr[3] = GETU32(iv + 12); - if (!(mem = clCreateBuffer(ctx->context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, len, out, &err))) { + if (!(mem_ctr = clCreateBuffer(ctx->context, CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, sizeof(ctr), ctr, &err))) { cl_error_print(err); return -1; } - if ((err = clSetKernelArg(ctx->kernel, 1, sizeof(cl_mem), &mem)) != CL_SUCCESS) { + if ((err = clSetKernelArg(ctx->kernel, 1, sizeof(cl_mem), &mem_ctr)) != CL_SUCCESS) { cl_error_print(err); goto end; } + + if (out != in) { + memcpy(out, in, inlen); + } + if (!(mem_buf = clCreateBuffer(ctx->context, CL_MEM_READ_WRITE|CL_MEM_USE_HOST_PTR, inlen, out, &err))) { + cl_error_print(err); + return -1; + } + if ((err = clSetKernelArg(ctx->kernel, 2, sizeof(cl_mem), &mem_buf)) != CL_SUCCESS) { + cl_error_print(err); + goto end; + } + // on Apple M2, CL_KERNEL_WORK_GROUP_SIZE = 256 // but kernel will fail when local_work_size > 32. + // local_work_size might be restricted by the resources the kernel used. if ((err = clEnqueueNDRangeKernel(ctx->queue, ctx->kernel, dim, NULL, &global_work_size, &local_work_size, 0, NULL, NULL)) != CL_SUCCESS) { cl_error_print(err); goto end; } - if (!(p = clEnqueueMapBuffer(ctx->queue, mem, CL_TRUE, 0, 0, len, 0, NULL, NULL, &err))) { + if (!clEnqueueMapBuffer(ctx->queue, mem_buf, CL_TRUE, 0, 0, inlen, 0, NULL, NULL, &err)) { cl_error_print(err); goto end; } - if (p != out) { - fprintf(stderr, "%s %d: shit\n", __FILE__, __LINE__); - goto end; - } + + ctr[3] += (uint32_t)nblocks; + PUTU32(iv + 12, ctr[3]); ret = 1; end: - clReleaseMemObject(mem); + if (mem_ctr) clReleaseMemObject(mem_ctr); + if (mem_buf) clReleaseMemObject(mem_buf); return ret; } - - #define KERNEL(...) #__VA_ARGS__ static const char *sm4_cl_src = KERNEL( @@ -306,16 +330,14 @@ __constant unsigned char SBOX[256] = { 0x18, 0xf0, 0x7d, 0xec, 0x3a, 0xdc, 0x4d, 0x20, 0x79, 0xee, 0x5f, 0x3e, 0xd7, 0xcb, 0x39, 0x48, }; -__kernel void sm4_encrypt(__global const unsigned int *rkey, __global unsigned char *data) +__kernel void sm4_ctr32_encrypt(__global const unsigned int *rkey, __global const unsigned int *ctr, __global unsigned char *data) { __local unsigned char S[256]; __local unsigned int rk[32]; unsigned int x0, x1, x2, x3, x4, i, t; uint global_id = get_global_id(0); - __global unsigned char *p = data + 16 * global_id; - __global unsigned int *in = (__global unsigned int *)p; - __global unsigned int *out = (__global unsigned int *)p; + __global unsigned int *out = (__global unsigned int *)(data + 16 * global_id); if (get_local_id(0) == 0) { for (i = 0; i < 256; i++) { @@ -326,10 +348,10 @@ __kernel void sm4_encrypt(__global const unsigned int *rkey, __global unsigned c } } - x0 = (in[0] >> 24) | ((in[0] >> 8) & 0xff00) | ((in[0] << 8) & 0xff0000) | (in[0] << 24); - x1 = (in[1] >> 24) | ((in[1] >> 8) & 0xff00) | ((in[1] << 8) & 0xff0000) | (in[1] << 24); - x2 = (in[2] >> 24) | ((in[2] >> 8) & 0xff00) | ((in[2] << 8) & 0xff0000) | (in[2] << 24); - x3 = (in[3] >> 24) | ((in[3] >> 8) & 0xff00) | ((in[3] << 8) & 0xff0000) | (in[3] << 24); + x0 = ctr[0]; + x1 = ctr[1]; + x2 = ctr[2]; + x3 = ctr[3] + global_id; for (i = 0; i < 31; i++) { x4 = x1 ^ x2 ^ x3 ^ rk[i]; @@ -357,10 +379,10 @@ __kernel void sm4_encrypt(__global const unsigned int *rkey, __global unsigned c ((x4 << 18) | (x4 >> (32 - 18))) ^ ((x4 << 24) | (x4 >> (32 - 24)))); - out[0] = (x4 >> 24) | ((x4 >> 8) & 0xff00) | ((x4 << 8) & 0xff0000) | (x4 << 24); - out[1] = (x3 >> 24) | ((x3 >> 8) & 0xff00) | ((x3 << 8) & 0xff0000) | (x3 << 24); - out[2] = (x2 >> 24) | ((x2 >> 8) & 0xff00) | ((x2 << 8) & 0xff0000) | (x2 << 24); - out[3] = (x1 >> 24) | ((x1 >> 8) & 0xff00) | ((x1 << 8) & 0xff0000) | (x1 << 24); + out[0] ^= (x4 >> 24) | ((x4 >> 8) & 0xff00) | ((x4 << 8) & 0xff0000) | (x4 << 24); + out[1] ^= (x3 >> 24) | ((x3 >> 8) & 0xff00) | ((x3 << 8) & 0xff0000) | (x3 << 24); + out[2] ^= (x2 >> 24) | ((x2 >> 8) & 0xff00) | ((x2 << 8) & 0xff0000) | (x2 << 24); + out[3] ^= (x1 >> 24) | ((x1 >> 8) & 0xff00) | ((x1 << 8) & 0xff0000) | (x1 << 24); } ); diff --git a/tests/sm4_cltest.c b/tests/sm4_cltest.c index 34e81a934..af8613257 100644 --- a/tests/sm4_cltest.c +++ b/tests/sm4_cltest.c @@ -18,64 +18,64 @@ #include - -int test_sm4_cl(void) +static int test_sm4_cl_ctr32_encrypt(void) { - const uint8_t key[16] = { - 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, - 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, - }; - const uint8_t plaintext[16] = { - 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, - 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, - }; - const uint8_t ciphertext[16] = { - 0x68, 0x1e, 0xdf, 0x34, 0xd2, 0x06, 0x96, 0x5e, - 0x86, 0xb3, 0xe9, 0x4f, 0x53, 0x6e, 0x42, 0x46, - }; + const char *key_hex = "0123456789abcdeffedcba9876543210"; + const char *iv_hex = "0123456789abcdeffedcba9876543210"; + const char *plain_hex = "aaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbccccccccccccccccddddddddddddddddeeeeeeeeeeeeeeeeffffffffffffffffeeeeeeeeeeeeeeeeaaaaaaaaaaaaaaaa"; + const char *cipher_hex = "c2b4759e78ac3cf43d0852f4e8d5f9fd7256e8a5fcb65a350ee00630912e44492a0b17e1b85b060d0fba612d8a95831638b361fd5ffacd942f081485a83ca35d"; int ret = -1; SM4_CL_CTX ctx; - size_t nblocks = 1024; + uint8_t key[16]; + uint8_t iv[16]; + uint8_t ctr[16]; + size_t nblocks = 64; uint8_t *buf = NULL; + uint8_t *ciphertext = NULL; + size_t len; size_t i; - if (!(buf = (uint8_t *)malloc(16 * nblocks))) { error_print(); return -1; } - for (i = 0; i < nblocks; i++) { - memcpy(buf + 16 * i, plaintext, 16); + if (!(ciphertext = (uint8_t *)malloc(16 * nblocks))) { + error_print(); + goto end; } + hex_to_bytes(key_hex, strlen(key_hex), key, &len); + hex_to_bytes(iv_hex, strlen(iv_hex), iv, &len); + hex_to_bytes(plain_hex, strlen(plain_hex), buf, &len); + hex_to_bytes(cipher_hex, strlen(cipher_hex), ciphertext, &len); + if (sm4_cl_set_encrypt_key(&ctx, key) != 1) { error_print(); goto end; } - if (sm4_cl_encrypt(&ctx, buf, nblocks, buf) != 1) { + + memcpy(ctr, iv, sizeof(iv)); + if (sm4_cl_ctr32_encrypt(&ctx, ctr, buf, nblocks, buf) != 1) { error_print(); goto end; } - for (i = 0; i < nblocks; i++) { - //fprintf(stderr, "%zu ", i); - //format_bytes(stderr, 0, 0, "ciphertext", buf + 16*i, 16); - if (memcmp(buf + 16 * i, ciphertext, 16) != 0) { - error_print(); - goto end; - } + if (memcmp(buf, ciphertext, len) != 0) { + error_print(); + goto end; } + printf("%s() ok\n", __FUNCTION__); ret = 1; end: - if (buf) free(buf); sm4_cl_cleanup(&ctx); + if (buf) free(buf); + if (ciphertext) free(ciphertext); return ret; } - -int test_sm4_cl_speed(void) +static int test_sm4_cl_ctr32_encrypt_speed(void) { const uint8_t key[16] = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, @@ -90,15 +90,16 @@ int test_sm4_cl_speed(void) 0x86, 0xb3, 0xe9, 0x4f, 0x53, 0x6e, 0x42, 0x46, }; + int ret = -1; SM4_CL_CTX ctx; + uint8_t ctr[16]; size_t nblocks = 1024*1024; uint8_t *buf = NULL; clock_t start, end; double seconds; size_t i; - if (!(buf = (uint8_t *)malloc(16 * nblocks))) { error_print(); return -1; @@ -113,19 +114,15 @@ int test_sm4_cl_speed(void) } start = clock(); - if (sm4_cl_encrypt(&ctx, buf, nblocks, buf) != 1) { + if (sm4_cl_ctr32_encrypt(&ctx, ctr, buf, nblocks, buf) != 1) { error_print(); goto end; } end = clock(); seconds = (double)(end - start)/CLOCKS_PER_SEC; - fprintf(stderr, "sm4_cl_encrypt: %f-MiB per seconds\n", 16/seconds); - - - ret = 1; end: if (buf) free(buf); @@ -135,8 +132,8 @@ int test_sm4_cl_speed(void) int main(void) { - if (test_sm4_cl() != 1) goto err; - if (test_sm4_cl_speed() != 1) goto err; + if (test_sm4_cl_ctr32_encrypt() != 1) goto err; + if (test_sm4_cl_ctr32_encrypt_speed() != 1) goto err; printf("%s all tests passed\n", __FILE__); return 0; err: