ROCm · boringmorning · Sep 5, 2024
@@ -70,6 +70,10 @@ cd hipBLASLt; cd build/release
 --print_kernel_info        Print solution, kernel name and solution index.
 --rotating <value>         Use rotating memory blocks for each iteration, size in MB.                          (Default value is: 0)
 --use_gpu_timer            Use hipEventElapsedTime to profile elapsed time.                                    (Default value is: false)
+--uncachedA                Allocate un-cached memory for tensor A.                                             (Default value is: false)
+--uncachedB                Allocate un-cached memory for tensor B.                                             (Default value is: false)
+--uncachedC                Allocate un-cached memory for tensor C.                                             (Default value is: false)
+--uncachedD                Allocate un-cached memory for tensor D.                                             (Default value is: false)
 --splitk <value>           [Tuning parameter] Set split K for a solution, 0 is use solution's default value. (Only support GEMM + api_method mix or cpp)
 --wgm <value>              [Tuning parameter] Set workgroup mapping for a solution, 0 is use solution's default value. (Only support GEMM + api_method mix or cpp)
 --flush                    Flush icache

@@ -552,6 +552,22 @@ try
          value<bool>(&arg.use_gpu_timer)->default_value(false),
          "Use hipEventElapsedTime to profile elapsed time.")
 
+        ("uncachedA",
+         value<bool>(&arg.uncachedA)->default_value(false),
+         "Allocate un-cached memory for tensor A.")
+
+        ("uncachedB",
+         value<bool>(&arg.uncachedB)->default_value(false),
+         "Allocate un-cached memory for tensor B.")
+
+        ("uncachedC",
+         value<bool>(&arg.uncachedC)->default_value(false),
+         "Allocate un-cached memory for tensor C.")
+
+        ("uncachedD",
+         value<bool>(&arg.uncachedD)->default_value(false),
+         "Allocate un-cached memory for tensor D.")
+
         ("splitk",
          valueVec<uint32_t>(&gsu_vector),
          "[Tuning parameter] Set split K for a solution, 0 is use solution's default value. (Only support GEMM + api_method mix or cpp)")

@@ -141,6 +141,10 @@ void Arguments::init()
     use_user_args      = false;
     rotating           = 0;
     use_gpu_timer      = false;
+    uncachedA          = false;
+    uncachedB          = false;
+    uncachedC          = false;
+    uncachedD          = false;
 
     // tuning
     gsu_vector[0] = 0;

@@ -36,6 +36,23 @@
 #define MEM_MAX_GUARD_PAD 8192
 
 /* ============================================================================================ */
+
+//!
+//! @brief enum to specify the type of allocated device memory.
+//!
+typedef enum
+{
+    // Normal
+    HIPBLASLT_DEVICE_MEMORY_NORMAL,
+
+    // Managed
+    HIPBLASLT_DEVICE_MEMORY_MANAGED,
+
+    // Un-cached
+    HIPBLASLT_DEVICE_MEMORY_UNCACHED
+
+} hipblaslt_device_memory_type;
+
 /*! \brief  base-class to allocate/deallocate device memory */
 template <typename T>
 class d_vector
@@ -54,18 +71,18 @@ class d_vector
     }
 
 public:
-    bool use_HMM = false;
+    hipblaslt_device_memory_type m_type = HIPBLASLT_DEVICE_MEMORY_NORMAL;
 
 public:
     static T m_guard[MEM_MAX_GUARD_PAD];
 
 #ifdef GOOGLE_TEST
-    d_vector(size_t s, bool HMM = false)
+    d_vector(size_t s, hipblaslt_device_memory_type memType = HIPBLASLT_DEVICE_MEMORY_NORMAL)
         : m_size(s)
         , m_pad(std::min(g_DVEC_PAD, size_t(MEM_MAX_GUARD_PAD)))
         , m_guard_len(m_pad * sizeof(T))
         , m_bytes((s + m_pad * 2) * sizeof(T))
-        , use_HMM(HMM)
+        , m_type(memType)
     {
         // Initialize m_guard with random data
         if(!m_init_guard)
@@ -75,25 +92,40 @@ class d_vector
         }
     }
 #else
-    d_vector(size_t s, bool HMM = false)
+    d_vector(size_t s, hipblaslt_device_memory_type memType = HIPBLASLT_DEVICE_MEMORY_NORMAL)
         : m_size(s)
         , m_pad(0) // save current pad length
         , m_guard_len(0 * sizeof(T))
         , m_bytes(s ? s * sizeof(T) : sizeof(T))
-        , use_HMM(HMM)
+        , m_type(memType)
     {
     }
 #endif
 
     T* device_vector_setup()
     {
         T* d = nullptr;
-        if(use_HMM ? hipMallocManaged(&d, m_bytes) : (hipMalloc)(&d, m_bytes) != hipSuccess)
-        {
-            hipblaslt_cerr << "Error allocating " << m_bytes << " m_bytes (" << (m_bytes >> 30)
-                           << " GB)" << std::endl;
-
-            d = nullptr;
+        switch(m_type){
+            case HIPBLASLT_DEVICE_MEMORY_NORMAL:
+                if((hipMalloc)(&d, m_bytes) != hipSuccess)
+                    d = nullptr;
+                break;
+            case HIPBLASLT_DEVICE_MEMORY_MANAGED:
+                if(hipMallocManaged(&d, m_bytes) != hipSuccess)
+                    d = nullptr;
+                break;
+            case HIPBLASLT_DEVICE_MEMORY_UNCACHED:
+                if(hipExtMallocWithFlags((void**)&d, m_bytes, hipDeviceMallocUncached) != hipSuccess)
+                    d = nullptr;
+                break;
+            default:
+                hipblaslt_cerr << "Wrong device memory type: " << m_type << std::endl;
+                return nullptr;
+                break;
+        }
+        if(d == nullptr){
+            hipblaslt_cerr << "Error allocating un-cached memory " << m_bytes << " m_bytes (" << (m_bytes >> 30)
+                            << " GB)" << std::endl;
         }
 #ifdef GOOGLE_TEST
         else

@@ -56,10 +56,9 @@ class device_vector : public d_vector<T>
     //! @brief Constructor.
     //! @param n The length of the vector.
     //! @param inc The increment.
-    //! @param HMM         HipManagedMemory Flag.
-    //!
-    explicit device_vector(size_t n, int64_t inc = 1, bool HMM = false)
-        : d_vector<T>{n * std::abs(inc), HMM}
+    //! @param memType Memory type enum.
+    explicit device_vector(size_t n, int64_t inc = 1, hipblaslt_device_memory_type memType = HIPBLASLT_DEVICE_MEMORY_NORMAL)
+        : d_vector<T>{n * std::abs(inc), memType}
         , m_n{n}
         , m_inc{inc}
         , m_data{this->device_vector_setup()}
@@ -136,7 +135,7 @@ class device_vector : public d_vector<T>
                 = hipMemcpy(m_data + i_block * this->nmemb() / block_count,
                             (const T*)that,
                             this->nmemb() * sizeof(T) / block_count,
-                            this->use_HMM ? hipMemcpyHostToHost : hipMemcpyHostToDevice);
+                            this->m_type == HIPBLASLT_DEVICE_MEMORY_MANAGED ? hipMemcpyHostToHost : hipMemcpyHostToDevice);
             if(status != hipSuccess)
             {
                 return status;

@@ -163,6 +163,10 @@ struct Arguments
     bool    use_user_args;
     int32_t rotating;
     bool    use_gpu_timer;
+    bool    uncachedA;
+    bool    uncachedB;
+    bool    uncachedC;
+    bool    uncachedD;
 
     // tuning
     int32_t gsu_vector[MAX_SUPPORTED_NUM_PROBLEMS]; // This is for client
@@ -260,6 +264,10 @@ struct Arguments
     OPER(use_user_args) SEP          \
     OPER(rotating) SEP               \
     OPER(use_gpu_timer) SEP          \
+    OPER(uncachedA) SEP              \
+    OPER(uncachedB) SEP              \
+    OPER(uncachedC) SEP              \
+    OPER(uncachedD) SEP              \
     OPER(gsu_vector) SEP             \
     OPER(wgm_vector) SEP             \
     OPER(print_solution_found) SEP   \

@@ -287,6 +287,10 @@ Arguments:
   - use_user_args: c_bool
   - rotating: c_int32
   - use_gpu_timer: c_bool
+  - uncachedA: c_bool
+  - uncachedB: c_bool
+  - uncachedC: c_bool
+  - uncachedD: c_bool
   - gsu_vector: c_int32*32
   - wgm_vector: c_int32*32
   - print_solution_found: c_bool
@@ -386,6 +390,10 @@ Defaults:
   use_user_args: false
   rotating: 0
   use_gpu_timer: false
+  uncachedA: false
+  uncachedB: false
+  uncachedC: false
+  uncachedD: false
   gsu_vector: 0
   wgm_vector: 0
   print_solution_found: false

@@ -89,13 +89,13 @@ struct host_vector : std::vector<T>
     {
         hipError_t hip_err;
 
-        if(that.use_HMM && hipSuccess != (hip_err = hipDeviceSynchronize()))
+        if(that.m_type == HIPBLASLT_DEVICE_MEMORY_MANAGED && hipSuccess != (hip_err = hipDeviceSynchronize()))
             return hip_err;
 
         return hipMemcpy(*this,
                          that,
                          sizeof(T) * this->size(),
-                         that.use_HMM ? hipMemcpyHostToHost : hipMemcpyDeviceToHost);
+                         that.m_type == HIPBLASLT_DEVICE_MEMORY_MANAGED ? hipMemcpyHostToHost : hipMemcpyDeviceToHost);
     }
 
     //!

@@ -665,7 +665,11 @@ void testing_matmul_with_bias(const Arguments& arg)
 {
     double gpu_time_used, cpu_time_used;
     gpu_time_used = cpu_time_used = 0.0;
-    bool                   HMM    = arg.HMM;
+    hipblaslt_device_memory_type memType = arg.HMM? HIPBLASLT_DEVICE_MEMORY_MANAGED : HIPBLASLT_DEVICE_MEMORY_NORMAL;
+    hipblaslt_device_memory_type memTypeA = arg.uncachedA? HIPBLASLT_DEVICE_MEMORY_UNCACHED : memType;
+    hipblaslt_device_memory_type memTypeB = arg.uncachedB? HIPBLASLT_DEVICE_MEMORY_UNCACHED : memType;
+    hipblaslt_device_memory_type memTypeC = arg.uncachedC? HIPBLASLT_DEVICE_MEMORY_UNCACHED : memType;
+    hipblaslt_device_memory_type memTypeD = arg.uncachedD? HIPBLASLT_DEVICE_MEMORY_UNCACHED : memType;
     hipblaslt_local_handle handle{arg};
     hipStream_t            stream;
     CHECK_HIP_ERROR(hipStreamCreate(&stream));
@@ -976,15 +980,15 @@ void testing_matmul_with_bias(const Arguments& arg)
         }
 
         // allocate memory on device
-        dA[i] = new device_vector<TiA>(size_A[i] * block_count, 1, HMM);
-        dB[i] = new device_vector<TiB>(size_B[i] * block_count, 1, HMM);
-        dC[i] = new device_vector<To>(size_C[i] * block_count, 1, HMM);
+        dA[i] = new device_vector<TiA>(size_A[i] * block_count, 1, memTypeA);
+        dB[i] = new device_vector<TiB>(size_B[i] * block_count, 1, memTypeB);
+        dC[i] = new device_vector<To>(size_C[i] * block_count, 1, memTypeC);
         if(!arg.c_equal_d)
-            dD[i] = new device_vector<To>(size_D[i] * block_count, 1, HMM);
+            dD[i] = new device_vector<To>(size_D[i] * block_count, 1, memTypeD);
         else
             dD[i] = dC[i];
-        dBias[i]          = new device_vector<Tbias>(size_bias[i] * block_count, 1, HMM);
-        dScaleAlphaVec[i] = new device_vector<Talpha>(size_scaleAlphaVec[i] * block_count, 1, HMM);
+        dBias[i]          = new device_vector<Tbias>(size_bias[i] * block_count, 1, memType);
+        dScaleAlphaVec[i] = new device_vector<Talpha>(size_scaleAlphaVec[i] * block_count, 1, memType);
 
         CHECK_DEVICE_ALLOCATION(dA[i]->memcheck());
         CHECK_DEVICE_ALLOCATION(dB[i]->memcheck());
@@ -995,7 +999,7 @@ void testing_matmul_with_bias(const Arguments& arg)
         CHECK_DEVICE_ALLOCATION(dScaleAlphaVec[i]->memcheck());
         if(arg.use_e)
         {
-            dE[i] = new device_vector<To>(size_E[i] * block_count, 1, HMM);
+            dE[i] = new device_vector<To>(size_E[i] * block_count, 1, memType);
             CHECK_DEVICE_ALLOCATION(dE[i]->memcheck());
         }
         else
@@ -1005,33 +1009,33 @@ void testing_matmul_with_bias(const Arguments& arg)
 
         if(arg.scaleA)
         {
-            dScaleA[i] = new device_vector<Talpha>(size_scaleAVec[i] * block_count, 1, HMM);
+            dScaleA[i] = new device_vector<Talpha>(size_scaleAVec[i] * block_count, 1, memType);
             CHECK_DEVICE_ALLOCATION(dScaleA[i]->memcheck());
         }
         if(arg.scaleB)
         {
-            dScaleB[i] = new device_vector<Talpha>(size_scaleBVec[i] * block_count, 1, HMM);
+            dScaleB[i] = new device_vector<Talpha>(size_scaleBVec[i] * block_count, 1, memType);
             CHECK_DEVICE_ALLOCATION(dScaleB[i]->memcheck());
         }
         if(arg.scaleC)
         {
-            dScaleC[i] = new device_vector<Talpha>(1, 1, HMM);
+            dScaleC[i] = new device_vector<Talpha>(1, 1, memType);
             CHECK_DEVICE_ALLOCATION(dScaleC[i]->memcheck());
         }
         if(arg.scaleD)
         {
-            dScaleD[i] = new device_vector<Talpha>(1, 1, HMM);
+            dScaleD[i] = new device_vector<Talpha>(1, 1, memType);
             CHECK_DEVICE_ALLOCATION(dScaleD[i]->memcheck());
         }
         if(arg.amaxD)
         {
             epilogue_on[i] = true;
-            dAmaxD[i]      = new device_vector<Talpha>(1, 1, HMM);
+            dAmaxD[i]      = new device_vector<Talpha>(1, 1, memType);
             CHECK_DEVICE_ALLOCATION(dAmaxD[i]->memcheck());
         }
         if(arg.scaleE)
         {
-            dScaleE[i] = new device_vector<Talpha>(1, 1, HMM);
+            dScaleE[i] = new device_vector<Talpha>(1, 1, memType);
             CHECK_DEVICE_ALLOCATION(dScaleE[i]->memcheck());
         }
 
@@ -2212,7 +2216,7 @@ void testing_matmul_with_bias(const Arguments& arg)
 
     CHECK_SOLUTION_FOUND(returnedAlgoCount);
 
-    dWorkspace = new device_vector<unsigned char>(workspace_size * block_count, 1, HMM);
+    dWorkspace = new device_vector<unsigned char>(workspace_size * block_count, 1, memType);
     CHECK_DEVICE_ALLOCATION(dWorkspace->memcheck());
 
     if(arg.use_user_args)

@@ -120,6 +120,10 @@ For more information, see command:
    --print_kernel_info        Print solution, kernel name and solution index.
    --rotating <value>         Use rotating memory blocks for each iteration, size in MB.                          (Default value is: 0)
    --use_gpu_timer            Use hipEventElapsedTime to profile elapsed time.                                    (Default value is: false)
+   --uncachedA                Allocate un-cached memory for tensor A.                                             (Default value is: false)
+   --uncachedB                Allocate un-cached memory for tensor B.                                             (Default value is: false)
+   --uncachedC                Allocate un-cached memory for tensor C.                                             (Default value is: false)
+   --uncachedD                Allocate un-cached memory for tensor D.                                             (Default value is: false)
    --splitk <value>           [Tuning parameter] Set split K for a solution, 0 is use solution's default value. (Only support GEMM + api_method mix or cpp)
    --wgm <value>              [Tuning parameter] Set workgroup mapping for a solution, 0 is use solution's default value. (Only support GEMM + api_method mix or cpp)
    --help |-h                 produces this help message