diff --git a/clients/benchmarks/README.md b/clients/benchmarks/README.md index 1aeff111bd..81b5780da0 100644 --- a/clients/benchmarks/README.md +++ b/clients/benchmarks/README.md @@ -70,6 +70,10 @@ cd hipBLASLt; cd build/release --print_kernel_info Print solution, kernel name and solution index. --rotating Use rotating memory blocks for each iteration, size in MB. (Default value is: 0) --use_gpu_timer Use hipEventElapsedTime to profile elapsed time. (Default value is: false) +--uncachedA Allocate un-cached memory for tensor A. (Default value is: false) +--uncachedB Allocate un-cached memory for tensor B. (Default value is: false) +--uncachedC Allocate un-cached memory for tensor C. (Default value is: false) +--uncachedD Allocate un-cached memory for tensor D. (Default value is: false) --splitk [Tuning parameter] Set split K for a solution, 0 is use solution's default value. (Only support GEMM + api_method mix or cpp) --wgm [Tuning parameter] Set workgroup mapping for a solution, 0 is use solution's default value. (Only support GEMM + api_method mix or cpp) --flush Flush icache diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index b87924fee8..25638366d6 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -552,6 +552,22 @@ try value(&arg.use_gpu_timer)->default_value(false), "Use hipEventElapsedTime to profile elapsed time.") + ("uncachedA", + value(&arg.uncachedA)->default_value(false), + "Allocate un-cached memory for tensor A.") + + ("uncachedB", + value(&arg.uncachedB)->default_value(false), + "Allocate un-cached memory for tensor B.") + + ("uncachedC", + value(&arg.uncachedC)->default_value(false), + "Allocate un-cached memory for tensor C.") + + ("uncachedD", + value(&arg.uncachedD)->default_value(false), + "Allocate un-cached memory for tensor D.") + ("splitk", valueVec(&gsu_vector), "[Tuning parameter] Set split K for a solution, 0 is use solution's default value. (Only support GEMM + api_method mix or cpp)") diff --git a/clients/common/hipblaslt_arguments.cpp b/clients/common/hipblaslt_arguments.cpp index ce411ad02a..8949821a0a 100644 --- a/clients/common/hipblaslt_arguments.cpp +++ b/clients/common/hipblaslt_arguments.cpp @@ -141,6 +141,10 @@ void Arguments::init() use_user_args = false; rotating = 0; use_gpu_timer = false; + uncachedA = false; + uncachedB = false; + uncachedC = false; + uncachedD = false; // tuning gsu_vector[0] = 0; diff --git a/clients/include/d_vector.hpp b/clients/include/d_vector.hpp index e47152e17b..67c1ec85cc 100644 --- a/clients/include/d_vector.hpp +++ b/clients/include/d_vector.hpp @@ -36,6 +36,23 @@ #define MEM_MAX_GUARD_PAD 8192 /* ============================================================================================ */ + +//! +//! @brief enum to specify the type of allocated device memory. +//! +typedef enum +{ + // Normal + HIPBLASLT_DEVICE_MEMORY_NORMAL, + + // Managed + HIPBLASLT_DEVICE_MEMORY_MANAGED, + + // Un-cached + HIPBLASLT_DEVICE_MEMORY_UNCACHED + +} hipblaslt_device_memory_type; + /*! \brief base-class to allocate/deallocate device memory */ template class d_vector @@ -54,18 +71,18 @@ class d_vector } public: - bool use_HMM = false; + hipblaslt_device_memory_type m_type = HIPBLASLT_DEVICE_MEMORY_NORMAL; public: static T m_guard[MEM_MAX_GUARD_PAD]; #ifdef GOOGLE_TEST - d_vector(size_t s, bool HMM = false) + d_vector(size_t s, hipblaslt_device_memory_type memType = HIPBLASLT_DEVICE_MEMORY_NORMAL) : m_size(s) , m_pad(std::min(g_DVEC_PAD, size_t(MEM_MAX_GUARD_PAD))) , m_guard_len(m_pad * sizeof(T)) , m_bytes((s + m_pad * 2) * sizeof(T)) - , use_HMM(HMM) + , m_type(memType) { // Initialize m_guard with random data if(!m_init_guard) @@ -75,12 +92,12 @@ class d_vector } } #else - d_vector(size_t s, bool HMM = false) + d_vector(size_t s, hipblaslt_device_memory_type memType = HIPBLASLT_DEVICE_MEMORY_NORMAL) : m_size(s) , m_pad(0) // save current pad length , m_guard_len(0 * sizeof(T)) , m_bytes(s ? s * sizeof(T) : sizeof(T)) - , use_HMM(HMM) + , m_type(memType) { } #endif @@ -88,12 +105,27 @@ class d_vector T* device_vector_setup() { T* d = nullptr; - if(use_HMM ? hipMallocManaged(&d, m_bytes) : (hipMalloc)(&d, m_bytes) != hipSuccess) - { - hipblaslt_cerr << "Error allocating " << m_bytes << " m_bytes (" << (m_bytes >> 30) - << " GB)" << std::endl; - - d = nullptr; + switch(m_type){ + case HIPBLASLT_DEVICE_MEMORY_NORMAL: + if((hipMalloc)(&d, m_bytes) != hipSuccess) + d = nullptr; + break; + case HIPBLASLT_DEVICE_MEMORY_MANAGED: + if(hipMallocManaged(&d, m_bytes) != hipSuccess) + d = nullptr; + break; + case HIPBLASLT_DEVICE_MEMORY_UNCACHED: + if(hipExtMallocWithFlags((void**)&d, m_bytes, hipDeviceMallocUncached) != hipSuccess) + d = nullptr; + break; + default: + hipblaslt_cerr << "Wrong device memory type: " << m_type << std::endl; + return nullptr; + break; + } + if(d == nullptr){ + hipblaslt_cerr << "Error allocating un-cached memory " << m_bytes << " m_bytes (" << (m_bytes >> 30) + << " GB)" << std::endl; } #ifdef GOOGLE_TEST else diff --git a/clients/include/device_vector.hpp b/clients/include/device_vector.hpp index 5d21a5cdbe..02a83a8e3c 100644 --- a/clients/include/device_vector.hpp +++ b/clients/include/device_vector.hpp @@ -56,10 +56,9 @@ class device_vector : public d_vector //! @brief Constructor. //! @param n The length of the vector. //! @param inc The increment. - //! @param HMM HipManagedMemory Flag. - //! - explicit device_vector(size_t n, int64_t inc = 1, bool HMM = false) - : d_vector{n * std::abs(inc), HMM} + //! @param memType Memory type enum. + explicit device_vector(size_t n, int64_t inc = 1, hipblaslt_device_memory_type memType = HIPBLASLT_DEVICE_MEMORY_NORMAL) + : d_vector{n * std::abs(inc), memType} , m_n{n} , m_inc{inc} , m_data{this->device_vector_setup()} @@ -136,7 +135,7 @@ class device_vector : public d_vector = hipMemcpy(m_data + i_block * this->nmemb() / block_count, (const T*)that, this->nmemb() * sizeof(T) / block_count, - this->use_HMM ? hipMemcpyHostToHost : hipMemcpyHostToDevice); + this->m_type == HIPBLASLT_DEVICE_MEMORY_MANAGED ? hipMemcpyHostToHost : hipMemcpyHostToDevice); if(status != hipSuccess) { return status; diff --git a/clients/include/hipblaslt_arguments.hpp b/clients/include/hipblaslt_arguments.hpp index 8f3ba84d10..0b8a6fd0cc 100644 --- a/clients/include/hipblaslt_arguments.hpp +++ b/clients/include/hipblaslt_arguments.hpp @@ -163,6 +163,10 @@ struct Arguments bool use_user_args; int32_t rotating; bool use_gpu_timer; + bool uncachedA; + bool uncachedB; + bool uncachedC; + bool uncachedD; // tuning int32_t gsu_vector[MAX_SUPPORTED_NUM_PROBLEMS]; // This is for client @@ -260,6 +264,10 @@ struct Arguments OPER(use_user_args) SEP \ OPER(rotating) SEP \ OPER(use_gpu_timer) SEP \ + OPER(uncachedA) SEP \ + OPER(uncachedB) SEP \ + OPER(uncachedC) SEP \ + OPER(uncachedD) SEP \ OPER(gsu_vector) SEP \ OPER(wgm_vector) SEP \ OPER(print_solution_found) SEP \ diff --git a/clients/include/hipblaslt_common.yaml b/clients/include/hipblaslt_common.yaml index a29dba19a8..9b63a798a8 100755 --- a/clients/include/hipblaslt_common.yaml +++ b/clients/include/hipblaslt_common.yaml @@ -287,6 +287,10 @@ Arguments: - use_user_args: c_bool - rotating: c_int32 - use_gpu_timer: c_bool + - uncachedA: c_bool + - uncachedB: c_bool + - uncachedC: c_bool + - uncachedD: c_bool - gsu_vector: c_int32*32 - wgm_vector: c_int32*32 - print_solution_found: c_bool @@ -386,6 +390,10 @@ Defaults: use_user_args: false rotating: 0 use_gpu_timer: false + uncachedA: false + uncachedB: false + uncachedC: false + uncachedD: false gsu_vector: 0 wgm_vector: 0 print_solution_found: false diff --git a/clients/include/host_vector.hpp b/clients/include/host_vector.hpp index 1130b8f009..51c178ba6d 100644 --- a/clients/include/host_vector.hpp +++ b/clients/include/host_vector.hpp @@ -89,13 +89,13 @@ struct host_vector : std::vector { hipError_t hip_err; - if(that.use_HMM && hipSuccess != (hip_err = hipDeviceSynchronize())) + if(that.m_type == HIPBLASLT_DEVICE_MEMORY_MANAGED && hipSuccess != (hip_err = hipDeviceSynchronize())) return hip_err; return hipMemcpy(*this, that, sizeof(T) * this->size(), - that.use_HMM ? hipMemcpyHostToHost : hipMemcpyDeviceToHost); + that.m_type == HIPBLASLT_DEVICE_MEMORY_MANAGED ? hipMemcpyHostToHost : hipMemcpyDeviceToHost); } //! diff --git a/clients/include/testing_matmul.hpp b/clients/include/testing_matmul.hpp index 7d96a08751..145f6b662e 100644 --- a/clients/include/testing_matmul.hpp +++ b/clients/include/testing_matmul.hpp @@ -665,7 +665,11 @@ void testing_matmul_with_bias(const Arguments& arg) { double gpu_time_used, cpu_time_used; gpu_time_used = cpu_time_used = 0.0; - bool HMM = arg.HMM; + hipblaslt_device_memory_type memType = arg.HMM? HIPBLASLT_DEVICE_MEMORY_MANAGED : HIPBLASLT_DEVICE_MEMORY_NORMAL; + hipblaslt_device_memory_type memTypeA = arg.uncachedA? HIPBLASLT_DEVICE_MEMORY_UNCACHED : memType; + hipblaslt_device_memory_type memTypeB = arg.uncachedB? HIPBLASLT_DEVICE_MEMORY_UNCACHED : memType; + hipblaslt_device_memory_type memTypeC = arg.uncachedC? HIPBLASLT_DEVICE_MEMORY_UNCACHED : memType; + hipblaslt_device_memory_type memTypeD = arg.uncachedD? HIPBLASLT_DEVICE_MEMORY_UNCACHED : memType; hipblaslt_local_handle handle{arg}; hipStream_t stream; CHECK_HIP_ERROR(hipStreamCreate(&stream)); @@ -976,15 +980,15 @@ void testing_matmul_with_bias(const Arguments& arg) } // allocate memory on device - dA[i] = new device_vector(size_A[i] * block_count, 1, HMM); - dB[i] = new device_vector(size_B[i] * block_count, 1, HMM); - dC[i] = new device_vector(size_C[i] * block_count, 1, HMM); + dA[i] = new device_vector(size_A[i] * block_count, 1, memTypeA); + dB[i] = new device_vector(size_B[i] * block_count, 1, memTypeB); + dC[i] = new device_vector(size_C[i] * block_count, 1, memTypeC); if(!arg.c_equal_d) - dD[i] = new device_vector(size_D[i] * block_count, 1, HMM); + dD[i] = new device_vector(size_D[i] * block_count, 1, memTypeD); else dD[i] = dC[i]; - dBias[i] = new device_vector(size_bias[i] * block_count, 1, HMM); - dScaleAlphaVec[i] = new device_vector(size_scaleAlphaVec[i] * block_count, 1, HMM); + dBias[i] = new device_vector(size_bias[i] * block_count, 1, memType); + dScaleAlphaVec[i] = new device_vector(size_scaleAlphaVec[i] * block_count, 1, memType); CHECK_DEVICE_ALLOCATION(dA[i]->memcheck()); CHECK_DEVICE_ALLOCATION(dB[i]->memcheck()); @@ -995,7 +999,7 @@ void testing_matmul_with_bias(const Arguments& arg) CHECK_DEVICE_ALLOCATION(dScaleAlphaVec[i]->memcheck()); if(arg.use_e) { - dE[i] = new device_vector(size_E[i] * block_count, 1, HMM); + dE[i] = new device_vector(size_E[i] * block_count, 1, memType); CHECK_DEVICE_ALLOCATION(dE[i]->memcheck()); } else @@ -1005,33 +1009,33 @@ void testing_matmul_with_bias(const Arguments& arg) if(arg.scaleA) { - dScaleA[i] = new device_vector(size_scaleAVec[i] * block_count, 1, HMM); + dScaleA[i] = new device_vector(size_scaleAVec[i] * block_count, 1, memType); CHECK_DEVICE_ALLOCATION(dScaleA[i]->memcheck()); } if(arg.scaleB) { - dScaleB[i] = new device_vector(size_scaleBVec[i] * block_count, 1, HMM); + dScaleB[i] = new device_vector(size_scaleBVec[i] * block_count, 1, memType); CHECK_DEVICE_ALLOCATION(dScaleB[i]->memcheck()); } if(arg.scaleC) { - dScaleC[i] = new device_vector(1, 1, HMM); + dScaleC[i] = new device_vector(1, 1, memType); CHECK_DEVICE_ALLOCATION(dScaleC[i]->memcheck()); } if(arg.scaleD) { - dScaleD[i] = new device_vector(1, 1, HMM); + dScaleD[i] = new device_vector(1, 1, memType); CHECK_DEVICE_ALLOCATION(dScaleD[i]->memcheck()); } if(arg.amaxD) { epilogue_on[i] = true; - dAmaxD[i] = new device_vector(1, 1, HMM); + dAmaxD[i] = new device_vector(1, 1, memType); CHECK_DEVICE_ALLOCATION(dAmaxD[i]->memcheck()); } if(arg.scaleE) { - dScaleE[i] = new device_vector(1, 1, HMM); + dScaleE[i] = new device_vector(1, 1, memType); CHECK_DEVICE_ALLOCATION(dScaleE[i]->memcheck()); } @@ -2212,7 +2216,7 @@ void testing_matmul_with_bias(const Arguments& arg) CHECK_SOLUTION_FOUND(returnedAlgoCount); - dWorkspace = new device_vector(workspace_size * block_count, 1, HMM); + dWorkspace = new device_vector(workspace_size * block_count, 1, memType); CHECK_DEVICE_ALLOCATION(dWorkspace->memcheck()); if(arg.use_user_args) diff --git a/docs/clients.rst b/docs/clients.rst index c5af0e43db..72e9d18695 100644 --- a/docs/clients.rst +++ b/docs/clients.rst @@ -120,6 +120,10 @@ For more information, see command: --print_kernel_info Print solution, kernel name and solution index. --rotating Use rotating memory blocks for each iteration, size in MB. (Default value is: 0) --use_gpu_timer Use hipEventElapsedTime to profile elapsed time. (Default value is: false) + --uncachedA Allocate un-cached memory for tensor A. (Default value is: false) + --uncachedB Allocate un-cached memory for tensor B. (Default value is: false) + --uncachedC Allocate un-cached memory for tensor C. (Default value is: false) + --uncachedD Allocate un-cached memory for tensor D. (Default value is: false) --splitk [Tuning parameter] Set split K for a solution, 0 is use solution's default value. (Only support GEMM + api_method mix or cpp) --wgm [Tuning parameter] Set workgroup mapping for a solution, 0 is use solution's default value. (Only support GEMM + api_method mix or cpp) --help |-h produces this help message