From 89864cd640e1f32cb6fb9ed48fe4154debf2a5b7 Mon Sep 17 00:00:00 2001 From: magnum Date: Thu, 16 Mar 2023 02:08:04 +0100 Subject: [PATCH] NT-OpenCL: Early partial transfer of keybuffer This is well tested code in other formats. About 10% boost on 2080ti, against 5300 hashes and pure wordlist, no mask. Also adds an entry in doc/NEWS. Closes #5245. --- doc/NEWS | 5 +++++ src/opencl_nt_fmt_plug.c | 30 ++++++++++++++++++++++++------ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/doc/NEWS b/doc/NEWS index 1efc23cc3b0..4d4c985f296 100644 --- a/doc/NEWS +++ b/doc/NEWS @@ -296,6 +296,11 @@ Major changes from 1.9.0-jumbo-1 (May 2019) in this bleeding-edge version: - Added support for cracking SNTP-MS "timeroast". [magnum; 2023] +- Major overhaul of NT-opencl: Performance boost up to 50% depending on GPU. + [magnum; 2023] + +- Add NT-long-opencl (password length of up to 125 bytes). [magnum; 2023] + Major changes from 1.8.0-jumbo-1 (December 2014) to 1.9.0-jumbo-1 (May 2019): diff --git a/src/opencl_nt_fmt_plug.c b/src/opencl_nt_fmt_plug.c index f0ed54b02a8..20223e94d25 100644 --- a/src/opencl_nt_fmt_plug.c +++ b/src/opencl_nt_fmt_plug.c @@ -177,7 +177,9 @@ static cl_mem buffer_keys, buffer_idx, buffer_int_keys, buffer_int_key_loc; static cl_uint *saved_plain, *saved_idx, *saved_int_key_loc; static int static_gpu_locations[MASK_FMT_INT_PLHDR]; -static unsigned int key_idx = 0; +static size_t key_idx; +static size_t key_offset, idx_offset; + static struct fmt_main *self; #define STEP 0 @@ -505,6 +507,8 @@ static int get_hash_6(int index) { return bt_hash_table_64[ocl_hc_hash_ids[3 + 3 static void clear_keys(void) { key_idx = 0; + key_offset = 0; + idx_offset = 0; } static void set_key(char *_key, int index) @@ -535,6 +539,20 @@ static void set_key(char *_key, int index) } if (len) saved_plain[key_idx++] = *key & (0xffffffffU >> (32 - (len << 3))); + + /* Early partial transfer to GPU every 2 MB */ + if (4 * key_idx - key_offset > (2 << 20)) { + HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_keys, CL_FALSE, key_offset, 4 * key_idx - key_offset, saved_plain + key_offset / 4, 0, NULL, NULL), "failed in clEnqueueWriteBuffer buffer_keys."); + HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_idx, CL_FALSE, idx_offset, 4 * (index + 1) - idx_offset, saved_idx + idx_offset / 4, 0, NULL, NULL), "failed in clEnqueueWriteBuffer buffer_idx."); + + if (!mask_gpu_is_static) + HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_int_key_loc, CL_FALSE, idx_offset, 4 * (index + 1) - idx_offset, saved_int_key_loc + (idx_offset / 4), 0, NULL, NULL), "failed in clEnqueueWriteBuffer buffer_int_key_loc."); + + HANDLE_CLERROR(clFlush(queue[gpu_id]), "failed in clFlush"); + + key_offset = 4 * key_idx; + idx_offset = 4 * (index + 1); + } } static char *get_key(int index) @@ -593,13 +611,13 @@ static int crypt_all(int *pcount, struct db_salt *salt) //fprintf(stderr, "%s(%d) lws "Zu" gws "Zu" idx %u int_cand %d\n", __FUNCTION__, count, local_work_size, gws, key_idx, mask_int_cand.num_int_cand); // copy keys to the device - if (key_idx) - BENCH_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_keys, CL_FALSE, 0, 4 * key_idx, saved_plain, 0, NULL, multi_profilingEvent[0]), "failed in clEnqueueWriteBuffer buffer_keys."); - - BENCH_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_idx, CL_FALSE, 0, 4 * gws, saved_idx, 0, NULL, multi_profilingEvent[1]), "failed in clEnqueueWriteBuffer buffer_idx."); + if (key_idx) { + BENCH_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_keys, CL_FALSE, key_offset, 4 * key_idx - key_offset, saved_plain + key_offset / 4, 0, NULL, multi_profilingEvent[0]), "failed in clEnqueueWriteBuffer buffer_keys."); + BENCH_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_idx, CL_FALSE, idx_offset, 4 * gws - idx_offset, saved_idx + idx_offset / 4, 0, NULL, multi_profilingEvent[1]), "failed in clEnqueueWriteBuffer buffer_idx."); + } if (!mask_gpu_is_static) - BENCH_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_int_key_loc, CL_FALSE, 0, 4 * gws, saved_int_key_loc, 0, NULL, NULL), "failed in clEnqueueWriteBuffer buffer_int_key_loc."); + BENCH_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_int_key_loc, CL_FALSE, idx_offset, 4 * gws - idx_offset, saved_int_key_loc + (idx_offset / 4), 0, NULL, NULL), "failed in clEnqueueWriteBuffer buffer_int_key_loc."); return ocl_hc_64_extract_info(salt, set_kernel_args, set_kernel_args_kpc, init_kernel, gws, lws, pcount); }