From 1fe88e3866cab5b0aeb297d18dd2009da82266a3 Mon Sep 17 00:00:00 2001 From: wangzhen38 Date: Mon, 20 Jun 2022 08:28:59 +0000 Subject: [PATCH 1/2] cpplint fix 2 --- .../tensorrt/plugin/matmul_op_int8_plugin.cu | 54 +++++++++++-------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu index 7cab12b625d23..3b31638ec875d 100644 --- a/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu @@ -183,13 +183,13 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, int const ldbtransform = 32 * ((m_ + 8 - 1) / 8 * 8); int const ldctransform = 32 * n_; PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( - (void**)&Atransform_, + reinterpret_cast<void**> & Atransform_, sizeof(int8_t) * ((k_ + 32 - 1) / 32 * 32) / 32 * ldatransform)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( - (void**)&Btransform_, + reinterpret_cast<void**> & Btransform_, sizeof(int8_t) * ((k_ + 32 - 1) / 32 * 32) / 32 * ldbtransform)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( - (void**)&Ctransform_, + reinterpret_cast<void**> & Ctransform_, sizeof(int8_t) * ((m_ + 32 - 1) / 32 * 32) / 32 * ldctransform)); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( @@ -303,16 +303,18 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, for (int i = 0; i < n_; i++) { alpha_tem[i] = alpha_ * inscale_0 * inscale_1 / outscale; } - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMalloc((void**)&alpha_scale_, n_ * sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( + reinterpret_cast<void**> & alpha_scale_, n_ * sizeof(float))); cudaMemcpyAsync(alpha_scale_, &alpha_tem[0], n_ * sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; - PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMalloc(reinterpret_cast<void**> & alpha_zero_, 
sizeof(float))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); float one_tem = 1; - PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_one_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMalloc(reinterpret_cast<void**> & alpha_one_, sizeof(float))); cudaMemcpyAsync(alpha_one_, &one_tem, sizeof(float), cudaMemcpyHostToDevice); } else if (type_ == nvinfer1::DataType::kHALF) { @@ -381,11 +383,13 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, sizeof(matmul_model))); half alpha_tem = static_cast<half>(alpha_); - PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_scale_, sizeof(half))); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMalloc(reinterpret_cast<void**> & alpha_scale_, sizeof(half))); cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(half), cudaMemcpyHostToDevice); half zero_tem = static_cast<half>(zero); - PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(half))); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMalloc(reinterpret_cast<void**> & alpha_zero_, sizeof(half))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(half), cudaMemcpyHostToDevice); } else { @@ -455,11 +459,12 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, float alpha_tem = alpha_; PADDLE_ENFORCE_GPU_SUCCESS( - cudaMalloc((void**)&alpha_scale_, sizeof(float))); + cudaMalloc(reinterpret_cast<void**> & alpha_scale_, sizeof(float))); cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; - PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMalloc(reinterpret_cast<void**> & alpha_zero_, sizeof(float))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); } @@ -611,13 +616,13 @@ void MatmulPluginDynamic::configurePlugin( int const ldbtransform = 32 * ((m_max + 8 - 1) / 8 * 8); int const ldctransform = 32 * n_max; PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( - (void**)&Atransform_, + reinterpret_cast<void**> & Atransform_, 
sizeof(int8_t) * ((k_max + 32 - 1) / 32 * 32) / 32 * ldatransform)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( - (void**)&Btransform_, + reinterpret_cast<void**> & Btransform_, sizeof(int8_t) * ((k_max + 32 - 1) / 32 * 32) / 32 * ldbtransform)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( - (void**)&Ctransform_, + reinterpret_cast<void**> & Ctransform_, sizeof(int8_t) * ((m_max + 32 - 1) / 32 * 32) / 32 * ldctransform)); if (type_ == nvinfer1::DataType::kINT8) { @@ -625,35 +630,40 @@ void MatmulPluginDynamic::configurePlugin( for (int i = 0; i < n_max; i++) { alpha_tem[i] = alpha_ * inscale_0 * inscale_1 / outscale; } - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMalloc((void**)&alpha_scale_, n_max * sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( + reinterpret_cast<void**> & alpha_scale_, n_max * sizeof(float))); cudaMemcpyAsync(alpha_scale_, &alpha_tem[0], n_max * sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; - PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMalloc(reinterpret_cast<void**> & alpha_zero_, sizeof(float))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); float one_tem = 1; - PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_one_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMalloc(reinterpret_cast<void**> & alpha_one_, sizeof(float))); cudaMemcpyAsync(alpha_one_, &one_tem, sizeof(float), cudaMemcpyHostToDevice); } else if (type_ == nvinfer1::DataType::kHALF) { half alpha_tem = static_cast<half>(alpha_); - PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_scale_, sizeof(half))); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMalloc(reinterpret_cast<void**> & alpha_scale_, sizeof(half))); cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(half), cudaMemcpyHostToDevice); half zero_tem = static_cast<half>(zero); - PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(half))); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMalloc(reinterpret_cast<void**> & alpha_zero_, sizeof(half))); cudaMemcpyAsync(alpha_zero_, 
&zero_tem, sizeof(half), cudaMemcpyHostToDevice); } else { float alpha_tem = alpha_; PADDLE_ENFORCE_GPU_SUCCESS( - cudaMalloc((void**)&alpha_scale_, sizeof(float))); + cudaMalloc(reinterpret_cast<void**> & alpha_scale_, sizeof(float))); cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; - PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc((void**)&alpha_zero_, sizeof(float))); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMalloc(reinterpret_cast<void**> & alpha_zero_, sizeof(float))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); } From a7caa6d1c357886923d70a26aa574b3e59b9dcde Mon Sep 17 00:00:00 2001 From: wangzhen38 Date: Tue, 21 Jun 2022 02:19:50 +0000 Subject: [PATCH 2/2] cpplint fix 2 --- .../tensorrt/plugin/matmul_op_int8_plugin.cu | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu index 3b31638ec875d..1502f3913b541 100644 --- a/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.cu @@ -183,13 +183,13 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, int const ldbtransform = 32 * ((m_ + 8 - 1) / 8 * 8); int const ldctransform = 32 * n_; PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( - reinterpret_cast<void**> & Atransform_, + reinterpret_cast<void**>(&Atransform_), sizeof(int8_t) * ((k_ + 32 - 1) / 32 * 32) / 32 * ldatransform)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( - reinterpret_cast<void**> & Btransform_, + reinterpret_cast<void**>(&Btransform_), sizeof(int8_t) * ((k_ + 32 - 1) / 32 * 32) / 32 * ldbtransform)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( - reinterpret_cast<void**> & Ctransform_, + reinterpret_cast<void**>(&Ctransform_), sizeof(int8_t) * ((m_ + 32 - 1) / 32 * 32) / 32 * ldctransform)); PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutCreate( @@ -304,17 +304,17 @@ void 
MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, alpha_tem[i] = alpha_ * inscale_0 * inscale_1 / outscale; } PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( - reinterpret_cast<void**> & alpha_scale_, n_ * sizeof(float))); + reinterpret_cast<void**>(&alpha_scale_), n_ * sizeof(float))); cudaMemcpyAsync(alpha_scale_, &alpha_tem[0], n_ * sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; PADDLE_ENFORCE_GPU_SUCCESS( - cudaMalloc(reinterpret_cast<void**> & alpha_zero_, sizeof(float))); + cudaMalloc(reinterpret_cast<void**>(&alpha_zero_), sizeof(float))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); float one_tem = 1; PADDLE_ENFORCE_GPU_SUCCESS( - cudaMalloc(reinterpret_cast<void**> & alpha_one_, sizeof(float))); + cudaMalloc(reinterpret_cast<void**>(&alpha_one_), sizeof(float))); cudaMemcpyAsync(alpha_one_, &one_tem, sizeof(float), cudaMemcpyHostToDevice); } else if (type_ == nvinfer1::DataType::kHALF) { @@ -384,12 +384,12 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, half alpha_tem = static_cast<half>(alpha_); PADDLE_ENFORCE_GPU_SUCCESS( - cudaMalloc(reinterpret_cast<void**> & alpha_scale_, sizeof(half))); + cudaMalloc(reinterpret_cast<void**>(&alpha_scale_), sizeof(half))); cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(half), cudaMemcpyHostToDevice); half zero_tem = static_cast<half>(zero); PADDLE_ENFORCE_GPU_SUCCESS( - cudaMalloc(reinterpret_cast<void**> & alpha_zero_, sizeof(half))); + cudaMalloc(reinterpret_cast<void**>(&alpha_zero_), sizeof(half))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(half), cudaMemcpyHostToDevice); } else { @@ -459,12 +459,12 @@ void MatmulPlugin::configurePlugin(const nvinfer1::PluginTensorDesc* inputs, float alpha_tem = alpha_; PADDLE_ENFORCE_GPU_SUCCESS( - cudaMalloc(reinterpret_cast<void**> & alpha_scale_, sizeof(float))); + cudaMalloc(reinterpret_cast<void**>(&alpha_scale_), sizeof(float))); cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; PADDLE_ENFORCE_GPU_SUCCESS( - 
cudaMalloc(reinterpret_cast<void**> & alpha_zero_, sizeof(float))); + cudaMalloc(reinterpret_cast<void**>(&alpha_zero_), sizeof(float))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); } @@ -616,13 +616,13 @@ void MatmulPluginDynamic::configurePlugin( int const ldbtransform = 32 * ((m_max + 8 - 1) / 8 * 8); int const ldctransform = 32 * n_max; PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( - reinterpret_cast<void**> & Atransform_, + reinterpret_cast<void**>(&Atransform_), sizeof(int8_t) * ((k_max + 32 - 1) / 32 * 32) / 32 * ldatransform)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( - reinterpret_cast<void**> & Btransform_, + reinterpret_cast<void**>(&Btransform_), sizeof(int8_t) * ((k_max + 32 - 1) / 32 * 32) / 32 * ldbtransform)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( - reinterpret_cast<void**> & Ctransform_, + reinterpret_cast<void**>(&Ctransform_), sizeof(int8_t) * ((m_max + 32 - 1) / 32 * 32) / 32 * ldctransform)); if (type_ == nvinfer1::DataType::kINT8) { @@ -631,39 +631,39 @@ void MatmulPluginDynamic::configurePlugin( alpha_tem[i] = alpha_ * inscale_0 * inscale_1 / outscale; } PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( - reinterpret_cast<void**> & alpha_scale_, n_max * sizeof(float))); + reinterpret_cast<void**>(&alpha_scale_), n_max * sizeof(float))); cudaMemcpyAsync(alpha_scale_, &alpha_tem[0], n_max * sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; PADDLE_ENFORCE_GPU_SUCCESS( - cudaMalloc(reinterpret_cast<void**> & alpha_zero_, sizeof(float))); + cudaMalloc(reinterpret_cast<void**>(&alpha_zero_), sizeof(float))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); float one_tem = 1; PADDLE_ENFORCE_GPU_SUCCESS( - cudaMalloc(reinterpret_cast<void**> & alpha_one_, sizeof(float))); + cudaMalloc(reinterpret_cast<void**>(&alpha_one_), sizeof(float))); cudaMemcpyAsync(alpha_one_, &one_tem, sizeof(float), cudaMemcpyHostToDevice); } else if (type_ == nvinfer1::DataType::kHALF) { half alpha_tem = static_cast<half>(alpha_); PADDLE_ENFORCE_GPU_SUCCESS( - cudaMalloc(reinterpret_cast<void**> & alpha_scale_, sizeof(half))); + 
cudaMalloc(reinterpret_cast<void**>(&alpha_scale_), sizeof(half))); cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(half), cudaMemcpyHostToDevice); half zero_tem = static_cast<half>(zero); PADDLE_ENFORCE_GPU_SUCCESS( - cudaMalloc(reinterpret_cast<void**> & alpha_zero_, sizeof(half))); + cudaMalloc(reinterpret_cast<void**>(&alpha_zero_), sizeof(half))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(half), cudaMemcpyHostToDevice); } else { float alpha_tem = alpha_; PADDLE_ENFORCE_GPU_SUCCESS( - cudaMalloc(reinterpret_cast<void**> & alpha_scale_, sizeof(float))); + cudaMalloc(reinterpret_cast<void**>(&alpha_scale_), sizeof(float))); cudaMemcpyAsync(alpha_scale_, &alpha_tem, sizeof(float), cudaMemcpyHostToDevice); float zero_tem = zero; PADDLE_ENFORCE_GPU_SUCCESS( - cudaMalloc(reinterpret_cast<void**> & alpha_zero_, sizeof(float))); + cudaMalloc(reinterpret_cast<void**>(&alpha_zero_), sizeof(float))); cudaMemcpyAsync(alpha_zero_, &zero_tem, sizeof(float), cudaMemcpyHostToDevice); }