Skip to content

Commit

Permalink
Implement the grad and enhance the cache of norm_convolution fusion ops. (#36168)
Browse files Browse the repository at this point in the history
  • Loading branch information
Xreki authored Sep 29, 2021
1 parent b3d2dc7 commit 767050d
Show file tree
Hide file tree
Showing 4 changed files with 630 additions and 253 deletions.
2 changes: 2 additions & 0 deletions paddle/fluid/framework/operator_kernel_configs.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@ limitations under the License. */
#pragma once

#include <algorithm>
#include <mutex>
#include <unordered_map>
#include <vector>
#include "glog/logging.h"

namespace paddle {
namespace framework {
Expand Down
65 changes: 34 additions & 31 deletions paddle/fluid/operators/fused/cudnn_fusion_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,8 @@ limitations under the License. */

#pragma once

#include <string>
#include <vector>
#include "paddle/fluid/platform/cudnn_desc.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/framework/operator_kernel_configs.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/enforce.h"

Expand All @@ -41,12 +39,9 @@ class CudnnFusionOp {
}

~CudnnFusionOp() {
  // Destroy the fused-op descriptors owned by this object. The return
  // status is deliberately NOT checked (no PADDLE_ENFORCE_CUDA_SUCCESS):
  // a destructor must never throw, and during program teardown the
  // CUDA/cuDNN runtime may already have been shut down, in which case
  // these calls can fail harmlessly.
  dynload::cudnnDestroyFusedOpsVariantParamPack(op_variant_params_);
  dynload::cudnnDestroyFusedOpsConstParamPack(op_const_params_);
  dynload::cudnnDestroyFusedOpsPlan(op_);
}

// Execute fused op
Expand Down Expand Up @@ -121,41 +116,49 @@ class CudnnFusionOp {

// Get the workspace size, which is required before Execute().
// The fused-ops plan is built lazily on the first call and the resulting
// workspace size is cached, so repeated calls do not re-run the expensive
// cudnnMakeFusedOpsPlan. Returns the workspace size in bytes.
// NOTE(review): not thread-safe on its own — assumes external
// synchronization if one CudnnFusionOp is shared across threads; confirm
// against callers.
size_t GetWorkspaceSizeInBytes(cudnnHandle_t cudnn_handle) {
  if (!plan_created_) {
    workspace_bytes_ = 0U;
    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnMakeFusedOpsPlan(
        cudnn_handle, op_, op_const_params_, &workspace_bytes_));
    plan_created_ = true;
  }
  return workspace_bytes_;
}

private:
bool plan_created_;
size_t workspace_bytes_;

cudnnFusedOpsPlan_t op_;
cudnnFusedOpsConstParamPack_t op_const_params_;
cudnnFusedOpsVariantParamPack_t op_variant_params_;
};

static inline std::vector<int> GetStrides(const std::vector<int> &shape) {
if (shape.size() < 1) {
return {};
class CudnnFusionOpCache {
public:
static CudnnFusionOpCache &Instance() {
static CudnnFusionOpCache instance;
return instance;
}

framework::AlgorithmsCache<CudnnFusionOp *> *GetForward() {
return &forward_cache_;
}
int dim = static_cast<int>(shape.size());
std::vector<int> pro_shape(shape);
std::vector<int> strides(dim);
int temp = pro_shape[1];
pro_shape.erase(pro_shape.begin() + 1);
pro_shape.push_back(temp);
strides.back() = 1;
for (int i = dim - 2; i >= 0; --i) {
strides[i] = strides[i + 1] * pro_shape[i + 1];
framework::AlgorithmsCache<CudnnFusionOp *> *GetBackward() {
return &backward_cache_;
}
strides.pop_back();
strides.insert(strides.begin() + 1, 1);
return strides;
}

static inline int64_t AlignUp(int64_t a, int64_t b) { return (a + b - 1) / b; }
private:
CudnnFusionOpCache() {}
~CudnnFusionOpCache() {
// Need to delete the memory of cache.
}
CudnnFusionOpCache(const CudnnFusionOpCache &) {}

private:
framework::AlgorithmsCache<CudnnFusionOp *> forward_cache_;
framework::AlgorithmsCache<CudnnFusionOp *> backward_cache_;
};

#endif // CUDNN_VERSION >= 8000
} // namespace operators
Expand Down
Loading

0 comments on commit 767050d

Please sign in to comment.