ROCm · KKyang · Sep 27, 2023 · Sep 26, 2023
@@ -92,3 +92,53 @@ set_target_properties( hipblaslt-bench PROPERTIES
 add_dependencies( hipblaslt-bench hipblaslt-common )
 
 rocm_install(TARGETS hipblaslt-bench COMPONENT benchmarks)
+
+# Other bench files
+add_executable( hipblaslt-bench-groupedgemm-fixed-mk client_groupedgemm_fixed_mk.cpp)
+add_executable( hipblaslt-bench-extop-layernorm client_extop_layernorm.cpp)
+add_executable( hipblaslt-bench-extop-matrixtransform client_extop_matrixtransform.cpp)
+add_executable( hipblaslt-bench-extop-softmax client_extop_softmax.cpp)
+set(ext_bench_list_all hipblaslt-bench-groupedgemm-fixed-mk hipblaslt-bench-extop-layernorm hipblaslt-bench-extop-matrixtransform hipblaslt-bench-extop-softmax)
+
+foreach( exe ${ext_bench_list_all} )
+  target_link_libraries( ${exe} PRIVATE roc::hipblaslt )
+
+  set_target_properties( ${exe} PROPERTIES
+    CXX_STANDARD 17
+    CXX_STANDARD_REQUIRED ON
+    CXX_EXTENSIONS OFF
+    RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging"
+  )
+
+  target_compile_definitions( ${exe} PRIVATE ROCM_USE_FLOAT16 )
+
+  target_include_directories( ${exe}
+    PRIVATE
+      $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
+      $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/common>
+      $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../../library/include>
+      )
+
+  target_include_directories( ${exe}
+    SYSTEM PRIVATE
+      $<BUILD_INTERFACE:${HIP_INCLUDE_DIRS}>
+      )
+
+  if( CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    # GCC or hip-clang needs specific flags to turn on f16c intrinsics
+    target_compile_options( ${exe} PRIVATE -mf16c -Wno-unused-result )
+    target_compile_definitions( ${exe} PRIVATE ROCBLASLT_INTERNAL_API )
+  endif( )
+endforeach( )
+
+foreach( exe ${ext_bench_list_all} )
+  target_link_libraries( ${exe} PRIVATE hip::device )
+endforeach( )
+
+foreach( exe ${ext_bench_list_all} )
+  rocm_install(TARGETS ${exe} COMPONENT samples)
+endforeach( )
+
+# Add tests for special cases
+add_test(NAME hipblaslt-bench-groupedgemm-fixed-mk
+         COMMAND hipblaslt-bench-groupedgemm-fixed-mk --trans_a N --trans_b N --in_datatype fp16 --out_datatype fp16 -m 2048 -k 2048 -n 0 -n 1 -n 1 -n 0 -n 1 -n 0 -n 1 -n 0 -n 0 -n 0 -n 0 -n 0 -n 0 -n 0 -n 0 -n 0 -V)
@@ -0,0 +1,304 @@
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#include <hipblaslt/hipblaslt-ext-op.h>
+#include <hipblaslt/hipblaslt.h>
+#include <hipblaslt_init.hpp>
+#include <iostream>
+#include <numeric>
+#include <random>
+#include <type_traits>
+#include <vector>
+
+void printUsage(char* programName)
+{
+    std::cout << "Usage: " << programName << " <options>\n"
+              << "options:\n"
+              << "\t-h, --help\t\t\tShow this help message\n"
+              << "\t-m, --m\t\t\t\tSize of dim 0, default is 1335\n"
+              << "\t-n, --n\t\t\t\tSize of dim 1, default is 16\n"
+              << "\t-a, --affine\t\t\t\tEnable Gamma and Beta, default is false\n";
+}
+
+static std::random_device                    rd;
+static std::mt19937                          gen(rd());
+static std::uniform_real_distribution<float> dist(-1.f, 1.f);
+
+template <typename Iter>
+void randomize(Iter beg, Iter end)
+{
+    using ValueType = typename std::remove_pointer<Iter>::type;
+    for(auto i = beg; i != end; ++i)
+    {
+        *i = dist(gen);
+    }
+}
+
+void cpuLayerNorm(float*        out,
+                  float*        gpuMean,
+                  float*        gpuInvvar,
+                  float*        in,
+                  std::uint32_t batch,
+                  std::uint32_t length,
+                  float         eps   = 1e-05,
+                  float*        gamma = nullptr,
+                  float*        beta  = nullptr)
+{
+    // calculate gpuMean
+    for(int i = 0; i < batch; i++)
+    {
+        int    count = 0;
+        float* inC   = in + i * length;
+        float* outC  = out + i * length;
+
+        for(int j = 0; j < length; j++)
+        {
+            count        = count + 1;
+            float delta  = inC[j] - gpuMean[i];
+            gpuMean[i]   = gpuMean[i] + delta / count;
+            float delta2 = inC[j] - gpuMean[i];
+            gpuInvvar[i] = gpuInvvar[i] + delta * delta2;
+        }
+        gpuInvvar[i] = 1 / std::sqrt((gpuInvvar[i] / length) + eps);
+
+        // calculate gpuInvvar
+        for(int j = 0; j < length; j++)
+        {
+            outC[j] = (inC[j] - gpuMean[i]) * gpuInvvar[i];
+
+            if(gamma != nullptr)
+                outC[j] = outC[j] * gamma[j];
+
+            if(beta != nullptr)
+                outC[j] = outC[j] + beta[j];
+        }
+    }
+}
+
+int parseArgs(int argc, char** argv, size_t* m, size_t* n, bool* affine)
+{
+    if(argc <= 1)
+    {
+        return EXIT_SUCCESS;
+    }
+
+    for(int i = 1; i < argc; ++i)
+    {
+        std::string arg = argv[i];
+
+        if((arg.at(0) == '-') || ((arg.at(0) == '-') && (arg.at(1) == '-')))
+        {
+            if((arg == "-h") || (arg == "--help"))
+            {
+                return EXIT_FAILURE;
+            }
+
+            if(arg == "-m" || arg == "--m")
+            {
+                *m = std::stoul(argv[++i]);
+            }
+            else if(arg == "-n" || arg == "--n")
+            {
+                *n = std::stoul(argv[++i]);
+            }
+            else if(arg == "-a" || arg == "--affine")
+            {
+                *affine = std::stoul(argv[++i]);
+            }
+        }
+        else
+        {
+            std::cerr << "error with " << arg << std::endl;
+            std::cerr << "option must start with - or --" << std::endl << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    return EXIT_SUCCESS;
+}
+
+void dumpBuffer(const char* title, float* data, int M, int N)
+{
+    std::cout << "----- " << title << "----- " << std::endl;
+    for(int m = 0; m < M; m++)
+    {
+        for(int n = 0; n < N; n++)
+        {
+            std::cout << data[m * N + n] << " ";
+        }
+        std::cout << std::endl;
+    }
+    std::cout << std::endl;
+}
+
+template <typename T>
+void compare(const char* title, const std::vector<T>& cpuOutput, const std::vector<T>& refOutput)
+{
+    float maxErr = 0.0;
+    int   gpunan = 0;
+    int   cpunan = 0;
+    int   gpuinf = 0;
+    int   cpuinf = 0;
+    for(int i = 0; i < cpuOutput.size(); i++)
+    {
+        float err = std::abs(refOutput[i] - cpuOutput[i]);
+        maxErr    = (maxErr > err) ? maxErr : err;
+        if(std::isnan(cpuOutput[i]))
+        {
+            gpunan += 1;
+        }
+        if(std::isnan(refOutput[i]))
+        {
+            cpunan += 1;
+        }
+        if(std::isinf(cpuOutput[i]))
+        {
+            gpuinf += 1;
+        }
+        if(std::isinf(refOutput[i]))
+        {
+            cpuinf += 1;
+        }
+    }
+
+    std::cout << "----- " << title << " result"
+              << " -----" << std::endl;
+    if(gpunan)
+        std::cout << "gpunan: " << gpunan << std::endl;
+    if(cpunan)
+        std::cout << "cpunan: " << cpunan << std::endl;
+    if(gpuinf)
+        std::cout << "gpuinf: " << gpuinf << std::endl;
+    if(cpuinf)
+        std::cout << "cpuinf: " << cpuinf << std::endl;
+    std::cout << "max error : " << maxErr << std::endl;
+}
+
+int main(int argc, char** argv)
+{
+    std::size_t m{1};
+    std::size_t n{64};
+    bool        affine{false};
+
+    if(auto err = parseArgs(argc, argv, &m, &n, &affine))
+    {
+        printUsage(argv[0]);
+        return err;
+    }
+
+    std::size_t numElements     = m * n;
+    std::size_t elementNumBytes = sizeof(float);
+
+    float* gpuOutput{nullptr};
+    float* gpuMean{nullptr};
+    float* gpuInvvar{nullptr};
+    float* gpuInput{nullptr};
+    float* gpuGamma{nullptr};
+    float* gpuBeta{nullptr};
+
+    auto hipErr = hipMalloc(&gpuOutput, numElements * elementNumBytes);
+    hipErr      = hipMalloc(&gpuMean, m * elementNumBytes);
+    hipErr      = hipMalloc(&gpuInvvar, m * elementNumBytes);
+    hipErr      = hipMalloc(&gpuInput, numElements * elementNumBytes);
+    if(affine)
+    {
+        hipErr = hipMalloc(&gpuGamma, n * elementNumBytes);
+        hipErr = hipMalloc(&gpuBeta, n * elementNumBytes);
+    }
+
+    std::vector<float> cpuOutput(numElements, 0.f);
+    std::vector<float> cpuMean(m, 0.f);
+    std::vector<float> cpuInvvar(m, 0.f);
+    std::vector<float> cpuInput(numElements, 0.f);
+    std::vector<float> cpuGamma(n, 1.f);
+    std::vector<float> cpuBeta(n, 0.f);
+
+    std::vector<float> refOutput(numElements, 0.f);
+    std::vector<float> refMean(m, 0.f);
+    std::vector<float> refInvvar(m, 0.f);
+
+    randomize(begin(cpuInput), end(cpuInput));
+    if(affine)
+    {
+        randomize(begin(cpuGamma), end(cpuGamma));
+        randomize(begin(cpuBeta), end(cpuBeta));
+    }
+
+    hipErr = hipMemcpyHtoD(gpuInput, cpuInput.data(), numElements * elementNumBytes);
+    if(affine)
+    {
+        hipErr = hipMemcpyHtoD(gpuGamma, cpuGamma.data(), n * elementNumBytes);
+        hipErr = hipMemcpyHtoD(gpuBeta, cpuBeta.data(), n * elementNumBytes);
+    }
+
+    hipStream_t stream{};
+    hipErr = hipStreamCreate(&stream);
+    //warmup
+    auto hipblasltErr = hipblasltExtLayerNorm(HIPBLASLT_R_32F,
+                                              gpuOutput,
+                                              gpuMean,
+                                              gpuInvvar,
+                                              gpuInput,
+                                              m,
+                                              n,
+                                              1e-05,
+                                              gpuGamma,
+                                              gpuBeta,
+                                              stream);
+
+    hipErr = hipMemcpyDtoH(cpuOutput.data(), gpuOutput, numElements * elementNumBytes);
+    hipErr = hipMemcpyDtoH(cpuMean.data(), gpuMean, m * elementNumBytes);
+    hipErr = hipMemcpyDtoH(cpuInvvar.data(), gpuInvvar, m * elementNumBytes);
+
+    cpuLayerNorm(refOutput.data(),
+                 refMean.data(),
+                 refInvvar.data(),
+                 cpuInput.data(),
+                 m,
+                 n,
+                 1e-05,
+                 cpuGamma.data(),
+                 cpuBeta.data());
+
+    //    dumpBuffer("GPU", cpuOutput.data(), m, n);
+    //    dumpBuffer("CPU", refOutput.data(), m, n);
+
+    compare("Output", cpuOutput, refOutput);
+    compare("Mean", cpuMean, refMean);
+    compare("Invvar", cpuInvvar, refInvvar);
+
+    hipEvent_t beg, end;
+    hipErr      = hipEventCreate(&beg);
+    hipErr      = hipEventCreate(&end);
+    int numRuns = 200;
+    hipErr      = hipEventRecord(beg, stream);
+
+    for(int i = 0; i < numRuns; ++i)
+    {
+        hipblasltErr = hipblasltExtLayerNorm(HIPBLASLT_R_32F,
+                                             gpuOutput,
+                                             gpuMean,
+                                             gpuInvvar,
+                                             gpuInput,
+                                             m,
+                                             n,
+                                             1e-05,
+                                             gpuGamma,
+                                             gpuBeta,
+                                             stream);
+    }
+    hipErr = hipEventRecord(end, stream);
+    hipErr = hipEventSynchronize(end);
+    hipErr = hipStreamSynchronize(stream);
+    float dur{};
+    hipErr = hipEventElapsedTime(&dur, beg, end);
+    std::cout << "Time elapsed: " << std::to_string(dur / numRuns) << " ms\n";
+
+    hipErr = hipEventDestroy(beg);
+    hipErr = hipEventDestroy(end);
+    hipErr = hipStreamDestroy(stream);
+    hipErr = hipFree(gpuOutput);
+    hipErr = hipFree(gpuMean);
+    hipErr = hipFree(gpuInvvar);
+    hipErr = hipFree(gpuInput);
+    return 0;
+}