From 46a50e80251a85340ef3327c0ffc5da2197a1abb Mon Sep 17 00:00:00 2001
From: Lee Killough <killough@leekillough.com>
Date: Wed, 17 Jul 2019 18:24:35 -0400
Subject: [PATCH 1/2] Use a method similar to the one in rocBLAS to determine
 whether to use C++ and HIP features in tensile_bfloat16.h

---
 Tensile/Source/tensile_bfloat16.h | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)
diff --git a/Tensile/Source/tensile_bfloat16.h b/Tensile/Source/tensile_bfloat16.h
index ccb1ba50f..aa8325b3c 100644
--- a/Tensile/Source/tensile_bfloat16.h
+++ b/Tensile/Source/tensile_bfloat16.h
@@ -1,7 +1,7 @@
 /**
  * MIT License
  *
- * Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright 2019 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -30,22 +30,23 @@
 #ifndef _TENSILE_BFLOAT16_H_
 #define _TENSILE_BFLOAT16_H_
 
-#ifndef __cplusplus
-
-#include <inttypes.h>
+// If this is a C compiler, a C++ compiler below C++11, or a host-only compiler,
+// we include only a minimal definition of tensile_bfloat16.
+#if __cplusplus < 201103L || !defined(__HCC__)
 
+#include <stdint.h>
 typedef struct
 {
     uint16_t data;
 } tensile_bfloat16;
 
-#else // __cplusplus
-
-#include <hip/hip_runtime_api.h>
+#else // __cplusplus < 201103L || !defined(__HCC__)
 
-#include <cinttypes>
 #include <cmath>
-#include <iostream>
+#include <cstddef>
+#include <cstdint>
+#include <hip/hip_runtime.h>
+#include <ostream>
 #include <type_traits>
 
 struct tensile_bfloat16
@@ -57,7 +58,7 @@ struct tensile_bfloat16
     __host__ __device__ tensile_bfloat16() {}
 
     // round upper 16 bits of IEEE float to convert to bfloat16
-    explicit __host__ __device__ tensile_bfloat16(float f) : data(float_to_bfloat16(f)) {}
+    explicit __host__ __device__ tensile_bfloat16(float f) : data(float_to_bfloat16(f)) { }
 
     // zero extend lower 16 bits of bfloat16 to convert to IEEE float
     explicit __host__ __device__ operator float() const
@@ -65,17 +66,17 @@ struct tensile_bfloat16
         union
         {
             uint32_t int32;
-            float fp32;
+            float    fp32;
         } u = {uint32_t(data) << 16};
         return u.fp32;
     }
 
-    private:
+private:
     static __host__ __device__ uint16_t float_to_bfloat16(float f)
     {
         union
         {
-            float fp32;
+            float    fp32;
             uint32_t int32;
         } u = {f};
         if(~u.int32 & 0x7f800000)
@@ -191,6 +192,6 @@ inline __host__ __device__ tensile_bfloat16 abs(tensile_bfloat16 a)
 inline tensile_bfloat16 sin(tensile_bfloat16 a) { return tensile_bfloat16(sinf(float(a))); }
 inline tensile_bfloat16 cos(tensile_bfloat16 a) { return tensile_bfloat16(cosf(float(a))); }
 
-#endif // __cplusplus
+#endif // __cplusplus < 201103L || !defined(__HCC__)
 
 #endif // _TENSILE_BFLOAT16_H_

From 61bb3916cf87065f7e3f9f89f1db7cb50127410f Mon Sep 17 00:00:00 2001
From: Lee Killough <killough@leekillough.com>
Date: Sun, 1 Sep 2019 17:30:57 -0400
Subject: [PATCH 2/2] Make bfloat16 changes similar to rocBLAS PR #678

---
 Tensile/Source/MathTemplates.cpp  | 39 ++++++++++++++-----------------
 Tensile/Source/tensile_bfloat16.h |  7 +-----
 2 files changed, 19 insertions(+), 27 deletions(-)

diff --git a/Tensile/Source/MathTemplates.cpp b/Tensile/Source/MathTemplates.cpp
index d24960fe1..5268c2bb1 100644
--- a/Tensile/Source/MathTemplates.cpp
+++ b/Tensile/Source/MathTemplates.cpp
@@ -86,15 +86,15 @@ template<> TensileComplexDouble tensileGetOne<TensileComplexDouble>() {
 #ifdef Tensile_ENABLE_HALF
 template<> TensileHalf tensileGetRandom<TensileHalf>() { return static_cast<TensileHalf>((rand()%7) - 3); }
 #endif
-template<> uint32_t tensileGetRandom<uint32_t>() { 
-   int8_t t0 = static_cast<int8_t>((rand()%7) - 3); 
-   int8_t t1 = static_cast<int8_t>((rand()%7) - 3); 
-   int8_t t2 = static_cast<int8_t>((rand()%7) - 3); 
-   int8_t t3 = static_cast<int8_t>((rand()%7) - 3); 
+template<> uint32_t tensileGetRandom<uint32_t>() {
+   int8_t t0 = static_cast<int8_t>((rand()%7) - 3);
+   int8_t t1 = static_cast<int8_t>((rand()%7) - 3);
+   int8_t t2 = static_cast<int8_t>((rand()%7) - 3);
+   int8_t t3 = static_cast<int8_t>((rand()%7) - 3);
    int8_t t1x4[4] = {t0, t1, t2, t3};
-   uint32_t tmp; 
+   uint32_t tmp;
    memcpy(&tmp, t1x4, sizeof(uint32_t));
-   return tmp; 
+   return tmp;
 }
 template<> int32_t tensileGetRandom<int32_t>() { return static_cast<int32_t>((rand()%7) - 3); }
 template<> float tensileGetRandom<float>() { return static_cast<float>((rand()%201) - 100); }
@@ -142,20 +142,20 @@ template<> TensileComplexDouble tensileGetTypeForInt<TensileComplexDouble>( size
 #ifdef Tensile_ENABLE_HALF
 template<> TensileHalf tensileGetTrig<TensileHalf>(int i) { return static_cast<TensileHalf>(sin(i)); }
 #endif
-template<> uint32_t tensileGetTrig<uint32_t>(int i) { 
-   int8_t t0 = static_cast<int8_t>((rand()%7) - 3); 
-   int8_t t1 = static_cast<int8_t>((rand()%7) - 3); 
-   int8_t t2 = static_cast<int8_t>((rand()%7) - 3); 
-   int8_t t3 = static_cast<int8_t>((rand()%7) - 3); 
+template<> uint32_t tensileGetTrig<uint32_t>(int i) {
+   int8_t t0 = static_cast<int8_t>((rand()%7) - 3);
+   int8_t t1 = static_cast<int8_t>((rand()%7) - 3);
+   int8_t t2 = static_cast<int8_t>((rand()%7) - 3);
+   int8_t t3 = static_cast<int8_t>((rand()%7) - 3);
    int8_t t1x4[4] = {t0, t1, t2, t3};
-   uint32_t tmp; 
+   uint32_t tmp;
    memcpy(&tmp, t1x4, sizeof(uint32_t));
-   return tmp; 
+   return tmp;
 }
-template<> int32_t tensileGetTrig<int32_t>(int i) { return static_cast<int32_t>((rand()%7) - 3); }
-template<> float tensileGetTrig<float>(int i) { return static_cast<float>(sin(i)); }
-template<> tensile_bfloat16 tensileGetTrig<tensile_bfloat16>(int i) { return sin(static_cast<tensile_bfloat16>(i)); }
-template<> double tensileGetTrig<double>(int i) { return static_cast<double>(sin(i)); }
+template<> int32_t tensileGetTrig<int32_t>(int i) { return rand() % 7 - 3; }
+template<> float tensileGetTrig<float>(int i) { return sin(i); }
+template<> tensile_bfloat16 tensileGetTrig<tensile_bfloat16>(int i) { return tensile_bfloat16(sinf(i)); }
+template<> double tensileGetTrig<double>(int i) { return sin(i); }
 template<> TensileComplexFloat tensileGetTrig<TensileComplexFloat>(int i) {
   TensileComplexFloat r;
   TENSILEREAL(r) = tensileGetTrig<float>(i);
@@ -531,6 +531,3 @@ template<> std::string tensileToString(TensileHalf v){
 #endif
 template<> std::string tensileToString(tensile_bfloat16 v){
   return tensileToString(static_cast<float>(v)); }
-
-
-
diff --git a/Tensile/Source/tensile_bfloat16.h b/Tensile/Source/tensile_bfloat16.h
index c806b6884..3bef81a58 100644
--- a/Tensile/Source/tensile_bfloat16.h
+++ b/Tensile/Source/tensile_bfloat16.h
@@ -61,7 +61,7 @@ struct tensile_bfloat16
     explicit __host__ __device__ tensile_bfloat16(float f) : data(float_to_bfloat16(f)) { }
 
     // zero extend lower 16 bits of bfloat16 to convert to IEEE float
-    explicit __host__ __device__ operator float() const
+    __host__ __device__ operator float() const
     {
         union
         {
@@ -184,11 +184,6 @@ inline __host__ __device__ tensile_bfloat16 operator--(tensile_bfloat16& a, int)
 inline __host__ __device__ bool isinf(tensile_bfloat16 a) { return !(~a.data & 0x7f80) && !(a.data & 0x7f); }
 inline __host__ __device__ bool isnan(tensile_bfloat16 a) { return !(~a.data & 0x7f80) && +(a.data & 0x7f); }
 inline __host__ __device__ bool iszero(tensile_bfloat16 a) { return !(a.data & 0x7fff); }
-inline __host__ __device__ tensile_bfloat16 abs(tensile_bfloat16 a)
-{
-    a.data &= 0x7fff;
-    return a;
-}
 inline tensile_bfloat16 sin(tensile_bfloat16 a) { return tensile_bfloat16(sinf(float(a))); }
 inline tensile_bfloat16 cos(tensile_bfloat16 a) { return tensile_bfloat16(cosf(float(a))); }