
Commit 6304124

Merge pull request #2825 from ROCm/tf-missing-gpu-support
Added missing gpu support for Gelu and some other ops
2 parents e5b0d2f + 0091d03 commit 6304124

9 files changed: +74 −81 lines


tensorflow/core/kernels/image/image_ops.cc

Lines changed: 4 additions & 4 deletions
@@ -15,9 +15,9 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #include "tensorflow/core/kernels/image/image_ops.h"
 
@@ -192,7 +192,7 @@ TF_CALL_bfloat16(REGISTER);
 
 #undef REGISTER
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 typedef Eigen::GpuDevice GPUDevice;
 typedef generator::Mode Mode;
@@ -266,6 +266,6 @@ TF_CALL_double(REGISTER);
 
 #undef REGISTER
 
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 } // end namespace tensorflow

tensorflow/core/kernels/image/image_ops_gpu.cu.cc

Lines changed: 2 additions & 3 deletions
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #define EIGEN_USE_GPU
 
@@ -28,7 +28,6 @@ namespace functor {
 
 // Explicit instantiation of the GPU functor.
 typedef Eigen::GpuDevice GPUDevice;
-
 template class FillProjectiveTransform<GPUDevice, uint8>;
 template class FillProjectiveTransform<GPUDevice, int32>;
 template class FillProjectiveTransform<GPUDevice, int64>;
@@ -40,4 +39,4 @@ template class FillProjectiveTransform<GPUDevice, double>;
 
 } // end namespace tensorflow
 
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

tensorflow/core/kernels/relu_op.cc

Lines changed: 0 additions & 8 deletions
@@ -116,9 +116,7 @@ TF_CALL_half(DECLARE_GPU_NO_MLIR_SPEC);
 TF_CALL_float(DECLARE_GPU_NO_MLIR_SPEC);
 TF_CALL_double(DECLARE_GPU_NO_MLIR_SPEC);
 #endif
-#if GOOGLE_CUDA //No Rocm for now
 TF_CALL_bfloat16(DECLARE_GPU_NO_MLIR_SPEC);
-#endif
 #undef DECLARE_GPU_NO_MLIR_SPEC
 } // namespace functor
 
@@ -138,9 +136,7 @@ TF_CALL_half(REGISTER_GPU_NO_MLIR_KERNELS);
 TF_CALL_float(REGISTER_GPU_NO_MLIR_KERNELS);
 TF_CALL_double(REGISTER_GPU_NO_MLIR_KERNELS);
 #endif
-#if GOOGLE_CUDA //No Rocm for now
 TF_CALL_bfloat16(REGISTER_GPU_NO_MLIR_KERNELS);
-#endif
 #undef REGISTER_GPU_NO_MLIR_KERNELS
 
 // Forward declarations of the functor specializations for GPU.
@@ -210,9 +206,7 @@ void Relu<GPUDevice, qint8>::operator()(
 extern template struct Relu<GPUDevice, qint8>;
 
 TF_CALL_GPU_NUMBER_TYPES_NO_BF16(DECLARE_GPU_SPEC);
-#if GOOGLE_CUDA
 TF_CALL_bfloat16(DECLARE_GPU_SPEC);
-#endif
 } // namespace functor
 
 // Registration of the GPU implementations.
@@ -246,9 +240,7 @@ TF_CALL_bfloat16(DECLARE_GPU_SPEC);
                           SeluGradOp<GPUDevice, type>)
 
 TF_CALL_GPU_NUMBER_TYPES_NO_BF16(REGISTER_GPU_KERNELS);
-#if GOOGLE_CUDA
 TF_CALL_bfloat16(REGISTER_GPU_KERNELS);
-#endif
 #undef REGISTER_GPU_KERNELS
 
 template <typename Device>
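Note: removing the "#if GOOGLE_CUDA //No Rocm for now" guards above means the bfloat16 variants of these kernels are now declared and registered on ROCm builds as well as CUDA builds. A minimal sketch (not part of this commit; assumes a GPU build of TensorFlow) of one way to confirm that a bfloat16 Relu kernel is actually placed on the GPU:

import tensorflow as tf

# Illustrative check only. With soft device placement disabled, requesting
# /GPU:0 should fail if no GPU kernel is registered for Relu with bfloat16.
tf.config.set_soft_device_placement(False)

with tf.device("/GPU:0"):
  x = tf.constant([-2.0, -0.5, 0.0, 1.5], dtype=tf.bfloat16)
  y = tf.nn.relu(x)

print(y)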

tensorflow/core/kernels/relu_op_gpu.cu.cc

Lines changed: 48 additions & 55 deletions
@@ -234,68 +234,65 @@ struct Relu<Device, qint8> {
         reinterpret_cast<int32*>(output.data())));
   }
 };
-#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 template <class T>
-__global__ void GeluKernel(const T* in, T* out, int32 count) {
+__global__ void GeluKernel(const T* __restrict__ in,
+                           T* __restrict__ out, int32 count) {
   int i = threadIdx.x + blockIdx.x * blockDim.x;
   if (i >= count) return;
-  const auto scale = static_cast<T>(0.7978845608028654);
-  const auto p1 = scale;
-  const auto p3 = static_cast<T>(0.044715 * 0.7978845608028654);
-  T x = in[i];
-  out[i] = 0.5 * x * (1 + tanh(p1 * x + p3 * x * x * x));
+
+  constexpr bool is_half = std::is_same_v<T, Eigen::half>;
+  if constexpr(is_half || std::is_same_v<T, Eigen::bfloat16>) {
+    using NT = std::conditional_t< is_half, half, bfloat16 >;
+    auto *xin = reinterpret_cast<const NT*>(in);
+    auto *xout = reinterpret_cast<NT*>(out);
+    const float scale = 0.7978845608028654;
+    const float p1 = scale;
+    const float p3 = 0.044715 * 0.7978845608028654;
+    float x = xin[i];
+    float out = 0.5f * x * (1.f + tanh(p1 * x + p3 * x * x * x));
+    xout[i] = static_cast<NT>(out);
+  } else {
+    const auto scale = static_cast<T>(0.7978845608028654);
+    const auto p1 = scale;
+    const auto p3 = static_cast<T>(0.044715 * 0.7978845608028654);
+    T x = in[i];
+    out[i] = 0.5 * x * (1. + tanh(p1 * x + p3 * x * x * x));
+  }
 }
 
 template <class T>
-__global__ void GeluGradKernel(const T* gradient, const T* feature, T* backprop,
-                               int32 count) {
+__global__ void GeluGradKernel(const T* __restrict__ gradient,
+    const T* __restrict__ feature, T* __restrict__ backprop, int32 count) {
   int i = threadIdx.x + blockIdx.x * blockDim.x;
   if (i >= count) return;
 
-  const T p1 = static_cast<T>(0.7978845608028654);
-  const T p3 = static_cast<T>(0.044715 * 0.7978845608028654);
-  T x = feature[i];
-  T z = p1 * x + p3 * x * x * x;
-  T g = gradient[i];
-  T cz = 1. / cosh(z);
-  backprop[i] = static_cast<T>(
+  constexpr bool is_half = std::is_same_v<T, Eigen::half>;
+  if constexpr(is_half || std::is_same_v<T, Eigen::bfloat16>) {
+    using NT = std::conditional_t< is_half, half, bfloat16 >;
+    const float scale = 0.7978845608028654;
+    const float p1 = scale;
+    const float p3 = 0.044715 * 0.7978845608028654;
+    auto *xgrad = reinterpret_cast<const NT*>(gradient);
+    auto *xfeature = reinterpret_cast<const NT*>(feature);
+    auto *xbackprop = reinterpret_cast<NT*>(backprop);
+    float x = xfeature[i];
+    float z = p1 * x + p3 * x * x * x;
+    float g = xgrad[i];
+    float cz = 1.f / cosh(z);
+    float out = g * 0.5f * (1.f + tanh(z) +
+                            x * (p1 + 3 * p3 * x * x) * cz * cz);
+    xbackprop[i] = static_cast< NT >(out);
+  } else {
+    const T p1 = static_cast<T>(0.7978845608028654);
+    const T p3 = static_cast<T>(0.044715 * 0.7978845608028654);
+    T x = feature[i];
+    T z = p1 * x + p3 * x * x * x;
+    T g = gradient[i];
+    T cz = 1. / cosh(z);
+    backprop[i] = static_cast<T>(
         g * 0.5 * (1. + tanh(z) + x * (p1 + 3 * p3 * x * x) * cz * cz));
-}
-
-template <>
-__global__ void GeluKernel<Eigen::half>(const Eigen::half* _in,
-                                        Eigen::half* _out, int32 count) {
-  int i = threadIdx.x + blockIdx.x * blockDim.x;
-  if (i >= count) return;
-  const half* in = reinterpret_cast<const half*>(_in);
-  half* out = reinterpret_cast<half*>(_out);
-  const float scale = 0.7978845608028654;
-  const float p1 = scale;
-  const float p3 = 0.044715 * 0.7978845608028654;
-  float x = in[i];
-  out[i] = 0.5 * x * (1 + tanh(p1 * x + p3 * x * x * x));
-}
-
-template <>
-__global__ void GeluGradKernel<Eigen::half>(const Eigen::half* _gradient,
-                                            const Eigen::half* _feature,
-                                            Eigen::half* _backprop,
-                                            int32 count) {
-  int i = threadIdx.x + blockIdx.x * blockDim.x;
-  if (i >= count) return;
-  const float scale = 0.7978845608028654;
-  const float p1 = scale;
-  const float p3 = 0.044715 * 0.7978845608028654;
-  const half* gradient = reinterpret_cast<const half*>(_gradient);
-  const half* feature = reinterpret_cast<const half*>(_feature);
-  half* backprop = reinterpret_cast<half*>(_backprop);
-  float x = feature[i];
-  float z = p1 * x + p3 * x * x * x;
-  float g = gradient[i];
-  float cz = 1. / cosh(z);
-  backprop[i] = g * 0.5 * (1. + tanh(z) + x * (p1 + 3 * p3 * x * x) * cz * cz);
+  }
 }
 
 template <typename T>
@@ -338,9 +335,7 @@ TF_CALL_half(DEFINE_GPU_NO_MLIR_KERNELS);
 TF_CALL_float(DEFINE_GPU_NO_MLIR_KERNELS);
 TF_CALL_double(DEFINE_GPU_NO_MLIR_KERNELS);
 #endif
-#if GOOGLE_CUDA
 TF_CALL_bfloat16(DEFINE_GPU_NO_MLIR_KERNELS);
-#endif
 #undef DEFINE_GPU_NO_MLIR_KERNELS
 
 // Definition of the GPU implementations declared in relu_op.cc.
@@ -356,9 +351,7 @@ TF_CALL_bfloat16(DEFINE_GPU_KERNELS);
 template struct functor::GeluGrad<GPUDevice, T>;
 
 TF_CALL_GPU_NUMBER_TYPES_NO_BF16(DEFINE_GPU_KERNELS);
-#if GOOGLE_CUDA
 TF_CALL_bfloat16(DEFINE_GPU_KERNELS);
-#endif
 template struct functor::Relu<GPUDevice, qint8>;
 
 } // end namespace tensorflow
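For reference, the kernels above compute the tanh approximation of GELU: 0.7978845608028654 is sqrt(2/pi), and p3 = 0.044715 * sqrt(2/pi) is the coefficient of the cubic term. A minimal NumPy sketch of the same forward and gradient formulas (illustrative only, not part of this commit; the function names are made up):

import numpy as np

P1 = 0.7978845608028654   # sqrt(2 / pi)
P3 = 0.044715 * P1        # coefficient of the cubic term

def gelu_approx(x):
  # 0.5 * x * (1 + tanh(p1 * x + p3 * x^3)), as in GeluKernel
  z = P1 * x + P3 * x ** 3
  return 0.5 * x * (1.0 + np.tanh(z))

def gelu_approx_grad(g, x):
  # g * 0.5 * (1 + tanh(z) + x * (p1 + 3 * p3 * x^2) / cosh(z)^2),
  # as in GeluGradKernel
  z = P1 * x + P3 * x ** 3
  cz = 1.0 / np.cosh(z)
  return g * 0.5 * (1.0 + np.tanh(z) + x * (P1 + 3.0 * P3 * x ** 2) * cz * cz)

x = np.linspace(-3.0, 3.0, 7)
print(gelu_approx(x))
print(gelu_approx_grad(np.ones_like(x), x))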

tensorflow/core/kernels/stateless_random_gamma_op_gpu.cu.cc

Lines changed: 1 addition & 1 deletion
@@ -183,4 +183,4 @@ TF_CALL_double(REGISTER_GPU_SPEC);
 
 } // namespace tensorflow
 
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

tensorflow/core/kernels/unique_op_gpu.cu.h

Lines changed: 1 addition & 1 deletion
@@ -449,6 +449,6 @@ class UniqueOpGPU : public AsyncOpKernel {
 
 } // end namespace tensorflow
 
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #endif // TENSORFLOW_CORE_KERNELS_UNIQUE_OP_GPU_CU_H_

tensorflow/core/kernels/unique_op_gpu_0.cu.cc

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/unique_op_gpu.cu.h"

tensorflow/core/kernels/unique_op_gpu_1.cu.cc

Lines changed: 2 additions & 2 deletions
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/unique_op_gpu.cu.h"
@@ -39,4 +39,4 @@ TF_CALL_FLOAT_TYPES(REGISTER_UNIQUE_GPU);
 
 } // end namespace tensorflow
 
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

tensorflow/python/kernel_tests/nn_ops/relu_op_test.py

Lines changed: 15 additions & 6 deletions
@@ -70,9 +70,9 @@ def testNumbersGPU(self):
       self.skipTest("No GPU available")
     for t in [
         np.float16,
+        dtypes.bfloat16.as_numpy_dtype,
         np.float32,
         np.float64,
-        dtypes.bfloat16.as_numpy_dtype,
     ]:
       self._testRelu(
           np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
@@ -550,17 +550,25 @@ def testNumbersCPU(self):
   def testNumbersGPU(self):
     if not test.is_gpu_available():
       self.skipTest("No GPU available")
-    for t in [np.float16, np.float32, np.float64]:
+    for t in [np.float16, dtypes.bfloat16.as_numpy_dtype, np.float32, np.float64]:
       self._testGelu(np.array([[-9, 7, -5, 3, -1],
                                [1, -3, 5, -7, 9]]).astype(t))
 
   def testGradients(self):
-    for t in [np.float16, np.float32, np.float64]:
+    for t in [np.float16, dtypes.bfloat16.as_numpy_dtype, np.float32, np.float64]:
+
+      is_f16 = t == np.float16
+      is_bf16 = t == dtypes.bfloat16.as_numpy_dtype
       for gpu in [True, False]:
        if gpu and not test.is_gpu_available():
          continue
-        delta = 2e-2 if t == np.float16 else 1e-3
-        tol = 2e-2 if t == np.float16 else (1e-4 if t == np.float32 else 1e-6)
+        delta = 2e-2 if is_f16 or is_bf16 else 1e-3
+        tol = 3e-2 if is_bf16 else \
+              2e-2 if is_f16 else \
+              1e-4 if t == np.float32 else 1e-6
+        if is_bf16 and not gpu:
+          tol = 0.1  # really bad accuracy on CPU for bf16
+
        def approx_gelu(x):
          return nn_ops.gelu(x, approximate=True)
        with self.session(use_gpu=gpu):
@@ -571,7 +579,8 @@ def approx_gelu(x):
         err = gradient_checker_v2.max_error(
             e1, e2)
         print(e1, e2)
-        print("gelu", t, "GPU" if gpu else "CPU", "gradient err = ", err)
+        print("gelu", t, "GPU" if gpu else "CPU", \
+              "gradient err = ", err, " tol = ", tol)
         self.assertLess(err, tol)
 
 class SeluTest(test.TestCase):
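The updated test now exercises bfloat16 GELU gradients with a looser tolerance on GPU (and a much looser one on CPU). A rough standalone sketch of the kind of check it performs (illustrative only, not part of this commit; assumes a GPU build with the bfloat16 Gelu kernel registered):

import numpy as np
import tensorflow as tf

x32 = np.array([[-9., 7., -5., 3., -1.], [1., -3., 5., -7., 9.]], dtype=np.float32)

# Compare bfloat16 on GPU against a float32 reference; bfloat16 carries
# only ~8 bits of mantissa, so the tolerance is deliberately loose.
with tf.device("/GPU:0"):
  y_bf16 = tf.nn.gelu(tf.cast(x32, tf.bfloat16), approximate=True)
y_ref = tf.nn.gelu(tf.constant(x32), approximate=True)

np.testing.assert_allclose(
    tf.cast(y_bf16, tf.float32).numpy(), y_ref.numpy(), rtol=3e-2, atol=3e-2)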
