Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions clients/benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ cd hipBLASLt; cd build/release
--print_kernel_info Print solution, kernel name and solution index.
--rotating <value> Use rotating memory blocks for each iteration, size in MB. (Default value is: 0)
--use_gpu_timer Use hipEventElapsedTime to profile elapsed time. (Default value is: false)
--uncachedA Allocate un-cached memory for tensor A. (Default value is: false)
--uncachedB Allocate un-cached memory for tensor B. (Default value is: false)
--uncachedC Allocate un-cached memory for tensor C. (Default value is: false)
--uncachedD Allocate un-cached memory for tensor D. (Default value is: false)
--splitk <value> [Tuning parameter] Set split K for a solution, 0 is use solution's default value. (Only support GEMM + api_method mix or cpp)
--wgm <value> [Tuning parameter] Set workgroup mapping for a solution, 0 is use solution's default value. (Only support GEMM + api_method mix or cpp)
--flush Flush icache
Expand Down
16 changes: 16 additions & 0 deletions clients/benchmarks/client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,22 @@ try
value<bool>(&arg.use_gpu_timer)->default_value(false),
"Use hipEventElapsedTime to profile elapsed time.")

("uncachedA",
value<bool>(&arg.uncachedA)->default_value(false),
"Allocate un-cached memory for tensor A.")

("uncachedB",
value<bool>(&arg.uncachedB)->default_value(false),
"Allocate un-cached memory for tensor B.")

("uncachedC",
value<bool>(&arg.uncachedC)->default_value(false),
"Allocate un-cached memory for tensor C.")

("uncachedD",
value<bool>(&arg.uncachedD)->default_value(false),
"Allocate un-cached memory for tensor D.")

("splitk",
valueVec<uint32_t>(&gsu_vector),
"[Tuning parameter] Set split K for a solution, 0 is use solution's default value. (Only support GEMM + api_method mix or cpp)")
Expand Down
4 changes: 4 additions & 0 deletions clients/common/hipblaslt_arguments.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,10 @@ void Arguments::init()
use_user_args = false;
rotating = 0;
use_gpu_timer = false;
uncachedA = false;
uncachedB = false;
uncachedC = false;
uncachedD = false;

// tuning
gsu_vector[0] = 0;
Expand Down
54 changes: 43 additions & 11 deletions clients/include/d_vector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,23 @@
#define MEM_MAX_GUARD_PAD 8192

/* ============================================================================================ */

//!
//! @brief enum to specify the type of allocated device memory.
//!
typedef enum
{
// Normal
HIPBLASLT_DEVICE_MEMORY_NORMAL,

// Managed
HIPBLASLT_DEVICE_MEMORY_MANAGED,

// Un-cached
HIPBLASLT_DEVICE_MEMORY_UNCACHED

} hipblaslt_device_memory_type;

/*! \brief base-class to allocate/deallocate device memory */
template <typename T>
class d_vector
Expand All @@ -54,18 +71,18 @@ class d_vector
}

public:
bool use_HMM = false;
hipblaslt_device_memory_type m_type = HIPBLASLT_DEVICE_MEMORY_NORMAL;

public:
static T m_guard[MEM_MAX_GUARD_PAD];

#ifdef GOOGLE_TEST
d_vector(size_t s, bool HMM = false)
d_vector(size_t s, hipblaslt_device_memory_type memType = HIPBLASLT_DEVICE_MEMORY_NORMAL)
: m_size(s)
, m_pad(std::min(g_DVEC_PAD, size_t(MEM_MAX_GUARD_PAD)))
, m_guard_len(m_pad * sizeof(T))
, m_bytes((s + m_pad * 2) * sizeof(T))
, use_HMM(HMM)
, m_type(memType)
{
// Initialize m_guard with random data
if(!m_init_guard)
Expand All @@ -75,25 +92,40 @@ class d_vector
}
}
#else
d_vector(size_t s, bool HMM = false)
d_vector(size_t s, hipblaslt_device_memory_type memType = HIPBLASLT_DEVICE_MEMORY_NORMAL)
: m_size(s)
, m_pad(0) // save current pad length
, m_guard_len(0 * sizeof(T))
, m_bytes(s ? s * sizeof(T) : sizeof(T))
, use_HMM(HMM)
, m_type(memType)
{
}
#endif

T* device_vector_setup()
{
T* d = nullptr;
if(use_HMM ? hipMallocManaged(&d, m_bytes) : (hipMalloc)(&d, m_bytes) != hipSuccess)
{
hipblaslt_cerr << "Error allocating " << m_bytes << " m_bytes (" << (m_bytes >> 30)
<< " GB)" << std::endl;

d = nullptr;
switch(m_type){
case HIPBLASLT_DEVICE_MEMORY_NORMAL:
if((hipMalloc)(&d, m_bytes) != hipSuccess)
d = nullptr;
break;
case HIPBLASLT_DEVICE_MEMORY_MANAGED:
if(hipMallocManaged(&d, m_bytes) != hipSuccess)
d = nullptr;
break;
case HIPBLASLT_DEVICE_MEMORY_UNCACHED:
if(hipExtMallocWithFlags((void**)&d, m_bytes, hipDeviceMallocUncached) != hipSuccess)
d = nullptr;
break;
default:
hipblaslt_cerr << "Wrong device memory type: " << m_type << std::endl;
return nullptr;
break;
}
if(d == nullptr){
hipblaslt_cerr << "Error allocating un-cached memory " << m_bytes << " m_bytes (" << (m_bytes >> 30)
<< " GB)" << std::endl;
}
#ifdef GOOGLE_TEST
else
Expand Down
9 changes: 4 additions & 5 deletions clients/include/device_vector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,9 @@ class device_vector : public d_vector<T>
//! @brief Constructor.
//! @param n The length of the vector.
//! @param inc The increment.
//! @param HMM HipManagedMemory Flag.
//!
explicit device_vector(size_t n, int64_t inc = 1, bool HMM = false)
: d_vector<T>{n * std::abs(inc), HMM}
//! @param memType Memory type enum.
explicit device_vector(size_t n, int64_t inc = 1, hipblaslt_device_memory_type memType = HIPBLASLT_DEVICE_MEMORY_NORMAL)
: d_vector<T>{n * std::abs(inc), memType}
, m_n{n}
, m_inc{inc}
, m_data{this->device_vector_setup()}
Expand Down Expand Up @@ -136,7 +135,7 @@ class device_vector : public d_vector<T>
= hipMemcpy(m_data + i_block * this->nmemb() / block_count,
(const T*)that,
this->nmemb() * sizeof(T) / block_count,
this->use_HMM ? hipMemcpyHostToHost : hipMemcpyHostToDevice);
this->m_type == HIPBLASLT_DEVICE_MEMORY_MANAGED ? hipMemcpyHostToHost : hipMemcpyHostToDevice);
if(status != hipSuccess)
{
return status;
Expand Down
8 changes: 8 additions & 0 deletions clients/include/hipblaslt_arguments.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,10 @@ struct Arguments
bool use_user_args;
int32_t rotating;
bool use_gpu_timer;
bool uncachedA;
bool uncachedB;
bool uncachedC;
bool uncachedD;

// tuning
int32_t gsu_vector[MAX_SUPPORTED_NUM_PROBLEMS]; // This is for client
Expand Down Expand Up @@ -260,6 +264,10 @@ struct Arguments
OPER(use_user_args) SEP \
OPER(rotating) SEP \
OPER(use_gpu_timer) SEP \
OPER(uncachedA) SEP \
OPER(uncachedB) SEP \
OPER(uncachedC) SEP \
OPER(uncachedD) SEP \
OPER(gsu_vector) SEP \
OPER(wgm_vector) SEP \
OPER(print_solution_found) SEP \
Expand Down
8 changes: 8 additions & 0 deletions clients/include/hipblaslt_common.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,10 @@ Arguments:
- use_user_args: c_bool
- rotating: c_int32
- use_gpu_timer: c_bool
- uncachedA: c_bool
- uncachedB: c_bool
- uncachedC: c_bool
- uncachedD: c_bool
- gsu_vector: c_int32*32
- wgm_vector: c_int32*32
- print_solution_found: c_bool
Expand Down Expand Up @@ -386,6 +390,10 @@ Defaults:
use_user_args: false
rotating: 0
use_gpu_timer: false
uncachedA: false
uncachedB: false
uncachedC: false
uncachedD: false
gsu_vector: 0
wgm_vector: 0
print_solution_found: false
Expand Down
4 changes: 2 additions & 2 deletions clients/include/host_vector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,13 @@ struct host_vector : std::vector<T>
{
hipError_t hip_err;

if(that.use_HMM && hipSuccess != (hip_err = hipDeviceSynchronize()))
if(that.m_type == HIPBLASLT_DEVICE_MEMORY_MANAGED && hipSuccess != (hip_err = hipDeviceSynchronize()))
return hip_err;

return hipMemcpy(*this,
that,
sizeof(T) * this->size(),
that.use_HMM ? hipMemcpyHostToHost : hipMemcpyDeviceToHost);
that.m_type == HIPBLASLT_DEVICE_MEMORY_MANAGED ? hipMemcpyHostToHost : hipMemcpyDeviceToHost);
}

//!
Expand Down
34 changes: 19 additions & 15 deletions clients/include/testing_matmul.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -665,7 +665,11 @@ void testing_matmul_with_bias(const Arguments& arg)
{
double gpu_time_used, cpu_time_used;
gpu_time_used = cpu_time_used = 0.0;
bool HMM = arg.HMM;
hipblaslt_device_memory_type memType = arg.HMM? HIPBLASLT_DEVICE_MEMORY_MANAGED : HIPBLASLT_DEVICE_MEMORY_NORMAL;
hipblaslt_device_memory_type memTypeA = arg.uncachedA? HIPBLASLT_DEVICE_MEMORY_UNCACHED : memType;
hipblaslt_device_memory_type memTypeB = arg.uncachedB? HIPBLASLT_DEVICE_MEMORY_UNCACHED : memType;
hipblaslt_device_memory_type memTypeC = arg.uncachedC? HIPBLASLT_DEVICE_MEMORY_UNCACHED : memType;
hipblaslt_device_memory_type memTypeD = arg.uncachedD? HIPBLASLT_DEVICE_MEMORY_UNCACHED : memType;
hipblaslt_local_handle handle{arg};
hipStream_t stream;
CHECK_HIP_ERROR(hipStreamCreate(&stream));
Expand Down Expand Up @@ -976,15 +980,15 @@ void testing_matmul_with_bias(const Arguments& arg)
}

// allocate memory on device
dA[i] = new device_vector<TiA>(size_A[i] * block_count, 1, HMM);
dB[i] = new device_vector<TiB>(size_B[i] * block_count, 1, HMM);
dC[i] = new device_vector<To>(size_C[i] * block_count, 1, HMM);
dA[i] = new device_vector<TiA>(size_A[i] * block_count, 1, memTypeA);
dB[i] = new device_vector<TiB>(size_B[i] * block_count, 1, memTypeB);
dC[i] = new device_vector<To>(size_C[i] * block_count, 1, memTypeC);
if(!arg.c_equal_d)
dD[i] = new device_vector<To>(size_D[i] * block_count, 1, HMM);
dD[i] = new device_vector<To>(size_D[i] * block_count, 1, memTypeD);
else
dD[i] = dC[i];
dBias[i] = new device_vector<Tbias>(size_bias[i] * block_count, 1, HMM);
dScaleAlphaVec[i] = new device_vector<Talpha>(size_scaleAlphaVec[i] * block_count, 1, HMM);
dBias[i] = new device_vector<Tbias>(size_bias[i] * block_count, 1, memType);
dScaleAlphaVec[i] = new device_vector<Talpha>(size_scaleAlphaVec[i] * block_count, 1, memType);

CHECK_DEVICE_ALLOCATION(dA[i]->memcheck());
CHECK_DEVICE_ALLOCATION(dB[i]->memcheck());
Expand All @@ -995,7 +999,7 @@ void testing_matmul_with_bias(const Arguments& arg)
CHECK_DEVICE_ALLOCATION(dScaleAlphaVec[i]->memcheck());
if(arg.use_e)
{
dE[i] = new device_vector<To>(size_E[i] * block_count, 1, HMM);
dE[i] = new device_vector<To>(size_E[i] * block_count, 1, memType);
CHECK_DEVICE_ALLOCATION(dE[i]->memcheck());
}
else
Expand All @@ -1005,33 +1009,33 @@ void testing_matmul_with_bias(const Arguments& arg)

if(arg.scaleA)
{
dScaleA[i] = new device_vector<Talpha>(size_scaleAVec[i] * block_count, 1, HMM);
dScaleA[i] = new device_vector<Talpha>(size_scaleAVec[i] * block_count, 1, memType);
CHECK_DEVICE_ALLOCATION(dScaleA[i]->memcheck());
}
if(arg.scaleB)
{
dScaleB[i] = new device_vector<Talpha>(size_scaleBVec[i] * block_count, 1, HMM);
dScaleB[i] = new device_vector<Talpha>(size_scaleBVec[i] * block_count, 1, memType);
CHECK_DEVICE_ALLOCATION(dScaleB[i]->memcheck());
}
if(arg.scaleC)
{
dScaleC[i] = new device_vector<Talpha>(1, 1, HMM);
dScaleC[i] = new device_vector<Talpha>(1, 1, memType);
CHECK_DEVICE_ALLOCATION(dScaleC[i]->memcheck());
}
if(arg.scaleD)
{
dScaleD[i] = new device_vector<Talpha>(1, 1, HMM);
dScaleD[i] = new device_vector<Talpha>(1, 1, memType);
CHECK_DEVICE_ALLOCATION(dScaleD[i]->memcheck());
}
if(arg.amaxD)
{
epilogue_on[i] = true;
dAmaxD[i] = new device_vector<Talpha>(1, 1, HMM);
dAmaxD[i] = new device_vector<Talpha>(1, 1, memType);
CHECK_DEVICE_ALLOCATION(dAmaxD[i]->memcheck());
}
if(arg.scaleE)
{
dScaleE[i] = new device_vector<Talpha>(1, 1, HMM);
dScaleE[i] = new device_vector<Talpha>(1, 1, memType);
CHECK_DEVICE_ALLOCATION(dScaleE[i]->memcheck());
}

Expand Down Expand Up @@ -2212,7 +2216,7 @@ void testing_matmul_with_bias(const Arguments& arg)

CHECK_SOLUTION_FOUND(returnedAlgoCount);

dWorkspace = new device_vector<unsigned char>(workspace_size * block_count, 1, HMM);
dWorkspace = new device_vector<unsigned char>(workspace_size * block_count, 1, memType);
CHECK_DEVICE_ALLOCATION(dWorkspace->memcheck());

if(arg.use_user_args)
Expand Down
4 changes: 4 additions & 0 deletions docs/clients.rst
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,10 @@ For more information, see command:
--print_kernel_info Print solution, kernel name and solution index.
--rotating <value> Use rotating memory blocks for each iteration, size in MB. (Default value is: 0)
--use_gpu_timer Use hipEventElapsedTime to profile elapsed time. (Default value is: false)
--uncachedA Allocate un-cached memory for tensor A. (Default value is: false)
--uncachedB Allocate un-cached memory for tensor B. (Default value is: false)
--uncachedC Allocate un-cached memory for tensor C. (Default value is: false)
--uncachedD Allocate un-cached memory for tensor D. (Default value is: false)
--splitk <value> [Tuning parameter] Set split K for a solution, 0 is use solution's default value. (Only support GEMM + api_method mix or cpp)
--wgm <value> [Tuning parameter] Set workgroup mapping for a solution, 0 is use solution's default value. (Only support GEMM + api_method mix or cpp)
--help |-h produces this help message
Expand Down