Skip to content

Commit

Permalink
test warp_histogram
Browse files Browse the repository at this point in the history
  • Loading branch information
FindHao committed May 4, 2023
1 parent c8a6424 commit 9b5d849
Showing 1 changed file with 48 additions and 16 deletions.
64 changes: 48 additions & 16 deletions gpu-patch/src/test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -57,17 +57,19 @@
*/
static __device__ void unfold_records(gpu_patch_buffer_t *patch_buffer, gpu_patch_buffer_t *tmp_buffer)
{
auto warp_index = blockDim.x / GPU_PATCH_WARP_SIZE * blockIdx.x + threadIdx.x / GPU_PATCH_WARP_SIZE;
auto warp_id = blockDim.x / GPU_PATCH_WARP_SIZE * blockIdx.x + threadIdx.x / GPU_PATCH_WARP_SIZE;
// by default it is 4
auto num_warps = blockDim.x / GPU_PATCH_WARP_SIZE;
auto laneid = get_laneid();
const int idx = threadIdx.x + blockDim.x * blockIdx.x;

gpu_patch_record_address_t *records = (gpu_patch_record_address_t *)patch_buffer->records;
gpu_patch_addr_sort_t *addr_hist = (gpu_patch_addr_sort_t *)tmp_buffer->records;
gpu_patch_addr_hist_t *addr_hist = (gpu_patch_addr_hist_t *)tmp_buffer->records;
PRINT("gpu analysis->full: %u, analysis: %u, head_index: %u, tail_index: %u, size: %u, num_threads: %u",
patch_buffer->full, patch_buffer->analysis, patch_buffer->head_index, patch_buffer->tail_index,
patch_buffer->size, patch_buffer->num_threads)
// each warp will take care of one record (32 addresses) in each iteration
for (auto iter = warp_index; iter < patch_buffer->head_index; iter += num_warps)
for (auto iter = warp_id; iter < patch_buffer->head_index; iter += num_warps)
{
gpu_patch_record_address_t *record = records + iter;
uint64_t address = record->address[laneid];
Expand All @@ -76,7 +78,36 @@ static __device__ void unfold_records(gpu_patch_buffer_t *patch_buffer, gpu_patc
{
address = 0;
}
addr_hist[iter * GPU_PATCH_WARP_SIZE + laneid] = address;
// sort addresses of a record inside a warp
address = warp_sort(address, laneid);

uint64_t value = address;
uint64_t prev_value = __shfl_up_sync(0xffffffff, value, 1);
bool is_unique = (laneid == 0) || (value != prev_value);

unsigned int predicate = __ballot_sync(0xffffffff, !is_unique);
uint32_t mask = 0xFFFFFFFF << laneid;
uint32_t predicate2 = predicate & mask;
uint32_t predicate2_reverse = __brev(predicate2);
int leading_zeros_origin = __clz(predicate2_reverse);
// Find the position of the most significant bit (MSB) set to 1
int msb_position = 31 - __clz(predicate2_reverse);
// Create a mask with all bits set to 1 up to and including the MSB
mask = (0xFFFFFFFF >> (31 - msb_position));
// Invert the input number (flip all the bits)
unsigned int inverted_x = ~predicate2_reverse;
// Count the number of leading zeros in the inverted number masked with the mask
int leading_zeros = __clz(inverted_x & mask);
int leading_ones = (predicate & (1 << (laneid + 1))) ? leading_zeros - leading_zeros_origin : 0;
int count = leading_ones + 1;
int unique_count = __ballot_sync(0xffffffff, is_unique);
if (is_unique)
{
int output_idx = iter * GPU_PATCH_WARP_SIZE + __popc(unique_count & ((1 << laneid) - 1));
addr_hist[output_idx].address = value;
addr_hist[output_idx].count = count;
}
int num_unique = __popc(__ballot_sync(0xffffffff, is_unique));
}
tmp_buffer->head_index = patch_buffer->head_index * GPU_PATCH_WARP_SIZE;
}
Expand All @@ -86,9 +117,9 @@ static __device__ void unfold_records(gpu_patch_buffer_t *patch_buffer, gpu_patc

template <int THREADS>
static __device__ void block_radix_sort(
gpu_patch_buffer_t *tmp_buffer,
gpu_patch_addr_sort_t *tmp_buffer_records_g_sorted
) {
gpu_patch_buffer_t *tmp_buffer,
gpu_patch_addr_hist_t *tmp_buffer_records_g_sorted)
{
int num_of_records = tmp_buffer->head_index;
// DEFAULT_GPU_PATCH_RECORD_NUM is 1280*1024 by default. each record includes 32 addresses with uint64_t type. so the total size is 1280*1024*32*8 = 335544320 bytes. Since we have 1024 threads for our analysis kernel, each thread need 335544320/1024/1024 = 320KB memory. The max local memory per thread is 512KB, so we are good for default configuration.
// int items_per_thread = num_of_records / THREADS;
Expand All @@ -112,7 +143,7 @@ extern "C" __launch_bounds__(GPU_PATCH_ANALYSIS_THREADS, 1)
void gpu_analysis_hist(
gpu_patch_buffer_t *buffer,
gpu_patch_buffer_t *tmp_buffer,
gpu_patch_addr_sort_t *tmp_buffer_records_g_sorted
gpu_patch_addr_hist_t *tmp_buffer_records_g_sorted
// gpu_patch_buffer_t *hist_buffer
) {
// // Continue processing until CPU notifies analysis is done
Expand All @@ -134,19 +165,19 @@ extern "C" __launch_bounds__(GPU_PATCH_ANALYSIS_THREADS, 1)
int main(int argc, char **argv)
{
std::cout << "Hello, world!" << std::endl;
int num_records = 2000;
int num_records = 1024;

// tmp_buffer is used to store the unfolded records
gpu_patch_buffer_t *tmp_buffer;
CHECK_CALL(cudaMalloc, ((void **)&tmp_buffer, sizeof(gpu_patch_buffer_t)));
// tmp_buffer_records_g is used to store the unfolded records
void *tmp_buffer_records_g = NULL;
CHECK_CALL(cudaMalloc, ((void **)&tmp_buffer_records_g,
sizeof(gpu_patch_addr_sort_t) * num_records * GPU_PATCH_WARP_SIZE));
sizeof(gpu_patch_addr_hist_t) * num_records * GPU_PATCH_WARP_SIZE));
// tmp_buffer_records_g_sorted is used to store the sorted unfolded records
void * tmp_buffer_records_g_sorted = NULL;
CHECK_CALL(cudaMalloc, ((void **)&tmp_buffer_records_g_sorted,
sizeof(gpu_patch_addr_sort_t) * num_records * GPU_PATCH_WARP_SIZE));
sizeof(gpu_patch_addr_hist_t) * num_records * GPU_PATCH_WARP_SIZE));
// we need to update the records pointer in tmp_buffer in this way, because we can't directly update the records pointer in tmp_buffer on the CPU side.
gpu_patch_buffer_t *tmp_buffer_h;
tmp_buffer_h = (gpu_patch_buffer_t *)malloc(sizeof(gpu_patch_buffer_t));
Expand All @@ -171,24 +202,25 @@ int main(int argc, char **argv)
{
for (int j = 0; j < GPU_PATCH_WARP_SIZE; j++)
{
gpu_buffer_records_h[i].address[j] = i % 10;
gpu_buffer_records_h[i].address[j] = j % 10;
gpu_buffer_records_h[i].size = 1;
}
gpu_buffer_records_h[i].active = 0xffffffff;
}
CHECK_CALL(cudaMemcpy, (gpu_buffer, gpu_buffer_h, sizeof(gpu_patch_buffer_t), cudaMemcpyHostToDevice));
CHECK_CALL(cudaMemcpy, (gpu_buffer_records, gpu_buffer_records_h, sizeof(gpu_patch_record_address_t) * num_records, cudaMemcpyHostToDevice));
gpu_analysis_hist<<<1, GPU_PATCH_ANALYSIS_THREADS>>>(gpu_buffer, tmp_buffer, (gpu_patch_addr_sort_t *)tmp_buffer_records_g_sorted);
gpu_analysis_hist<<<1, GPU_PATCH_ANALYSIS_THREADS>>>(gpu_buffer, tmp_buffer, (gpu_patch_addr_hist_t *)tmp_buffer_records_g_sorted);

gpu_patch_addr_sort_t *tmp_buffer_records_h = (gpu_patch_addr_sort_t *)malloc(sizeof(gpu_patch_addr_sort_t) * num_records * GPU_PATCH_WARP_SIZE);
gpu_patch_addr_hist_t *tmp_buffer_records_h = (gpu_patch_addr_hist_t *)malloc(sizeof(gpu_patch_addr_hist_t) * num_records * GPU_PATCH_WARP_SIZE);
// copy the unfolded records from GPU to CPU
CHECK_CALL(cudaMemcpy, (tmp_buffer_records_h, tmp_buffer_records_g, sizeof(gpu_patch_addr_sort_t) * num_records * GPU_PATCH_WARP_SIZE, cudaMemcpyDeviceToHost));
CHECK_CALL(cudaMemcpy, (tmp_buffer_records_h, tmp_buffer_records_g, sizeof(gpu_patch_addr_hist_t) * num_records * GPU_PATCH_WARP_SIZE, cudaMemcpyDeviceToHost));
CHECK_CALL(cudaDeviceSynchronize, ());
std::cout << "unfolded records:" << std::endl;
for (int i = 0; i < num_records; i++)
{
for (int j = 0; j < GPU_PATCH_WARP_SIZE; j++)
{
std::cout << tmp_buffer_records_h[i * GPU_PATCH_WARP_SIZE + j] << " ";
std::cout << tmp_buffer_records_h[i * GPU_PATCH_WARP_SIZE + j].address << ":" << tmp_buffer_records_h[i * GPU_PATCH_WARP_SIZE + j].count << " ";
}
std::cout << std::endl;
}
Expand Down

0 comments on commit 9b5d849

Please sign in to comment.