Skip to content

Commit

Permalink
add the process for inaligned records
Browse files Browse the repository at this point in the history
  • Loading branch information
FindHao committed Jun 9, 2023
1 parent 565526d commit b08490f
Showing 1 changed file with 38 additions and 15 deletions.
53 changes: 38 additions & 15 deletions gpu-patch/src/test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
}

/**
* Each gpu_patch_buffer_t has a pointer to its records, and each records has 32 addresses. This function will unfold this structure into gpu_patch_buffer_t has new records while each record only has one address and its count.
* Each gpu_patch_buffer_t has a pointer to its records, and each records has 32 addresses. This function will unfold this structure into gpu_patch_buffer_t has new records while each record only has one address and its count. Besides the unfolding, this function will also do intra-warp counting.
* @param buffer: the original buffer with a bunch of records
* @param unfolded_buffer: the buffer with unfolded and intra-warp-processed records.
*/
Expand All @@ -69,9 +69,17 @@ static __device__ void unfold_records(gpu_patch_buffer_t *patch_buffer, gpu_patc
patch_buffer->full, patch_buffer->analysis, patch_buffer->head_index, patch_buffer->tail_index,
patch_buffer->size, patch_buffer->num_threads)
int addr_hist_index = 0;
auto iter = warp_id;
int round_head_index = (patch_buffer->head_index + num_warps - 1) / num_warps * num_warps;
// every record has 32 addresses with mask. but the number of record may not be `num_warps` aligned.
// e.g., there are 3 records. we need to let the last warp be inactive.
// each warp will take care with one record (32 addresses) in each iteration
for (auto iter = warp_id; iter < patch_buffer->head_index; iter += num_warps)
for (; iter < round_head_index; iter += num_warps)
{
if (iter >= patch_buffer->head_index)
{
continue;
}
gpu_patch_record_address_t *record = records + iter;
uint64_t address = record->address[laneid];
// if the thread is not active, set the address to 0
Expand Down Expand Up @@ -109,9 +117,11 @@ static __device__ void unfold_records(gpu_patch_buffer_t *patch_buffer, gpu_patc
{
unique_count_shared[warp_id] = __popc(unique_mark);
// unique_count_shared_accumulate[warp_id] = __popc(unique_mark);
if (warp_id == 0){
if (warp_id == 0)
{
int next_start = 0;
for (int i = 0; i < GPU_PATH_ANALYSIS_NUM_WARPS; i++){
for (int i = 0; i < GPU_PATH_ANALYSIS_NUM_WARPS; i++)
{
unique_count_shared_accumulate[i] = next_start;
next_start += unique_count_shared[i];
}
Expand All @@ -127,9 +137,11 @@ static __device__ void unfold_records(gpu_patch_buffer_t *patch_buffer, gpu_patc
addr_hist_count[output_idx] = count;
}
__syncthreads();
if (idx == 0){
if (idx == 0)
{
int all_unique_count = unique_count_shared_accumulate[GPU_PATH_ANALYSIS_NUM_WARPS - 1] + unique_count_shared[GPU_PATH_ANALYSIS_NUM_WARPS - 1];
for (int i = 0; i < all_unique_count; i++){
for (int i = 0; i < all_unique_count; i++)
{
addr_hist[addr_hist_index + i].address = addr_hist_addr[i];
addr_hist[addr_hist_index + i].count = addr_hist_count[i];
}
Expand All @@ -140,13 +152,11 @@ static __device__ void unfold_records(gpu_patch_buffer_t *patch_buffer, gpu_patc
unfolded_buffer->head_index = addr_hist_index;
}

#define ITEMS_PER_THREAD 4

/**
* @brief This function only sorts THREADS * ITEMS_PER_THREAD items in unfolded_buffer->records
* @Yueming TODO: add the histogram part
*/
template <int THREADS>
*/
template <int THREADS, int ITEMS_PER_THREAD>
static __device__ void block_radix_sort_tile(
uint64_t *d_in,
uint64_t *d_out)
Expand All @@ -163,7 +173,14 @@ static __device__ void block_radix_sort_tile(
{
d_out[threadIdx.x * ITEMS_PER_THREAD + i] = keys[i];
}
// maybe we can use the similar code in unfold_records to process all 4 warps in a block.
// maybe we can use the similar code in unfold_records to process all 4 warps in a block.
}

template <int THREADS, int ITEMS_PER_THREAD>
static __device__ void block_radix_sort(
gpu_patch_buffer_t *unfolded_buffer,
gpu_patch_buffer_t *hist_buffer)
{
}

extern "C" __launch_bounds__(GPU_PATCH_ANALYSIS_THREADS, 1)
Expand All @@ -189,13 +206,14 @@ extern "C" __launch_bounds__(GPU_PATCH_ANALYSIS_THREADS, 1)
// }
unfold_records(buffer, unfolded_buffer);
// @Yueming TODO: use a for loop to split the unfolded_buffer into multiple tiles, and use block_radix_sort_tile to process each tile. Add another outside for loop to process at least twice to compress more. Finally, the unfolded_buffer_records_g_sorted will have compressed histogram.
// block_radix_sort<GPU_PATCH_ANALYSIS_THREADS>(unfolded_buffer, unfolded_buffer_records_g_sorted);
// uint32_t tile_size = THREADS * GPU_PATCH_ANALYSIS_THREADS;
// block_radix_sort<GPU_PATCH_ANALYSIS_THREADS, GPU_PATCH_ANALYSIS_ITEMS>(unfolded_buffer, hist_buffer);
}

int main(int argc, char **argv)
{
std::cout << "Hello, world!" << std::endl;
int num_records = 1024;
int num_records = 3;

// unfolded_buffer is used to store the unfolded records
gpu_patch_buffer_t *unfolded_buffer;
Expand Down Expand Up @@ -232,7 +250,8 @@ int main(int argc, char **argv)
{
for (int j = 0; j < GPU_PATCH_WARP_SIZE; j++)
{
gpu_buffer_records_h[i].address[j] = j % 10;
// gpu_buffer_records_h[i].address[j] = j % 10;
gpu_buffer_records_h[i].address[j] = 1;
gpu_buffer_records_h[i].size = 1;
}
gpu_buffer_records_h[i].active = 0xffffffff;
Expand All @@ -244,8 +263,12 @@ int main(int argc, char **argv)
gpu_patch_addr_hist_t *unfolded_buffer_records_h = (gpu_patch_addr_hist_t *)malloc(sizeof(gpu_patch_addr_hist_t) * num_records * GPU_PATCH_WARP_SIZE);
// copy the unfolded records from GPU to CPU
CHECK_CALL(cudaMemcpy, (unfolded_buffer_records_h, unfolded_buffer_records_g, sizeof(gpu_patch_addr_hist_t) * num_records * GPU_PATCH_WARP_SIZE, cudaMemcpyDeviceToHost));
// copy the head_index back to CPU
CHECK_CALL(cudaMemcpy, (unfolded_buffer_h, unfolded_buffer, sizeof(gpu_patch_buffer_t), cudaMemcpyDeviceToHost));
CHECK_CALL(cudaDeviceSynchronize, ());
std::cout << "unfolded records:" << std::endl;
std::cout << std::endl
<< "unfolded records: "
<< "head_index:" << unfolded_buffer_h->head_index << std::endl;
for (int i = 0; i < num_records; i++)
{
for (int j = 0; j < GPU_PATCH_WARP_SIZE; j++)
Expand Down

0 comments on commit b08490f

Please sign in to comment.