diff --git a/paddle/common/hash_funcs.h b/paddle/common/hash_funcs.h
new file mode 100644
index 0000000000000..e4a905ff539b9
--- /dev/null
+++ b/paddle/common/hash_funcs.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cstddef>
+#include <functional>
+#include <vector>
+
+inline void HashCombine(std::size_t* seed) {}
+
+// combine hash value
+// https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
+template <typename T, typename... Rest>
+inline void HashCombine(std::size_t* seed, const T& v, Rest... rest) {
+  std::hash<T> hasher;
+  *seed ^= hasher(v) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2);
+  *seed *= 0x00000100000001B3;
+  HashCombine(seed, rest...);
+}
+
+// custom specialization of std::hash can be injected in namespace std
+// ref: https://en.cppreference.com/w/cpp/utility/hash
+namespace std {
+template <typename T>
+struct hash<std::vector<T>> {
+  std::size_t operator()(std::vector<T> const& vec) const noexcept {
+    std::size_t seed = 0xcbf29ce484222325;
+    for (auto val : vec) {
+      HashCombine(&seed, val);
+    }
+    return seed;
+  }
+};
+}  // namespace std
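A minimal usage sketch of the header above (hypothetical, not part of the PR): HashCombine folds an arbitrary argument list into one seed, and the injected std::hash specialization makes vectors directly usable as unordered-map keys.

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/common/hash_funcs.h"

int main() {
  std::size_t seed = 0;
  HashCombine(&seed, 1, 2.0f, std::string("op"));  // variadic fold over fields
  std::hash<std::vector<int>> vec_hasher;          // injected specialization
  std::unordered_map<std::vector<int>, int> shape_cache{{{1, 2, 3}, 42}};
  std::cout << seed << " " << vec_hasher({1, 2, 3}) << " "
            << shape_cache.at({1, 2, 3}) << "\n";
  return 0;
}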
diff --git a/paddle/fluid/framework/custom_operator_utils.h b/paddle/fluid/framework/custom_operator_utils.h
index ec00e8b9d0d6b..bf1750dfdbbb5 100644
--- a/paddle/fluid/framework/custom_operator_utils.h
+++ b/paddle/fluid/framework/custom_operator_utils.h
@@ -19,10 +19,11 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/string/string_helper.h"
 #include "paddle/phi/api/ext/op_meta_info.h"
+#include "paddle/phi/core/enforce.h"
 
 namespace paddle {
 namespace framework {
-
+constexpr char kCustomDialectPrefix[] = "custom_op.";  // NOLINT
 namespace detail {
 
 // dynamic lib load func
@@ -81,6 +82,31 @@ inline static bool IsMemberOf(const std::vector<std::string>& vec,
   return std::find(vec.cbegin(), vec.cend(), name) != vec.cend();
 }
 
+inline static const OpMetaInfo& GetOpInfoByPirName(
+    const std::string& pir_op_name) {
+  auto custom_name = pir_op_name.substr(strlen(kCustomDialectPrefix));
+  int pos = custom_name.length();
+  if (custom_name.find("_grad_grad") != custom_name.npos) {
+    pos = custom_name.find("_grad_grad") + 1;
+  } else if (custom_name.find("_grad") != custom_name.npos) {
+    pos = custom_name.find("_grad") + 1;
+  }
+  auto custom_name_prefix = custom_name.substr(0, pos);
+  auto map_iter =
+      paddle::OpMetaInfoMap::Instance().GetMap().find(custom_name_prefix);
+  if (map_iter == paddle::OpMetaInfoMap::Instance().GetMap().end()) {
+    PADDLE_THROW("The info of custom op: " + custom_name + " does not exist!");
+  }
+  const auto& vec_op_meta = map_iter->second;
+  if (custom_name.find("_grad_grad") != custom_name.npos) {
+    return vec_op_meta[2];
+  } else if (custom_name.find("_grad") != custom_name.npos) {
+    return vec_op_meta[1];
+  } else {
+    return vec_op_meta[0];
+  }
+}
+
 }  // namespace detail
 }  // namespace framework
 }  // namespace paddle
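How the lookup resolves, given the vec_op_meta layout above (forward at [0], grad at [1], double grad at [2]) — hypothetical op names for illustration:

#include "paddle/fluid/framework/custom_operator_utils.h"

void Demo() {
  // "custom_op.custom_relu"           -> vec_op_meta[0] (forward)
  // "custom_op.custom_relu_grad"      -> vec_op_meta[1] (first-order grad)
  // "custom_op.custom_relu_grad_grad" -> vec_op_meta[2] (double grad)
  const auto& grad_meta = paddle::framework::detail::GetOpInfoByPirName(
      "custom_op.custom_relu_grad");
  (void)grad_meta;
}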
+ +#include "paddle/fluid/framework/new_executor/instruction/custom_kernel_instruction.h" +#include "paddle/fluid/framework/custom_operator_utils.h" +#include "paddle/fluid/framework/new_executor/instruction/instruction_util.h" +#include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h" +#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/operation.h" +#include "paddle/pir/core/value.h" + +namespace paddle { +namespace framework { + +void CustomKernelInstruction::BuildCustomContext( + const paddle::dialect::OpYamlInfoParser& op_yaml_info) { + Scope* inner_scope = value_exec_info_.GetScope(); + VLOG(6) << "Build custom op infermeta param inner_scope[" << inner_scope + << "]"; + + auto attr_map = op_->attributes(); + + // EmplaceBackInputs + auto& vec_input_tensor_params = op_yaml_info.TensorParams(true); + auto& name2id = op_yaml_info.InputName2Id(); + for (auto& t : vec_input_tensor_params) { + PADDLE_ENFORCE_EQ( + name2id.count(t), + true, + phi::errors::NotFound("param [%s] MUST in name2id map", t)); + + pir::Value ptr = op_->operand_source(op_yaml_info.InputName2Id().at(t)); + + if (!IsInvalid(ptr)) { + if (op_yaml_info.GetInputType(op_yaml_info.InputName2Id().at(t)) == + "pir::VectorType") { + vec_input_shapes_.emplace_back(); + vec_input_dtypes_.emplace_back(); + // NOTE(YuanRisheng): In dygraph mode, we can not distinguish Tensor and + // vector when user inputs None, so dygraph mode appends one + // un-initialized Tensor to CustomOpKernelContext. To be compatible with + // dygraph mode, `custom_vec_in` also emplace_back one un-initialized + // tensor here. + std::vector custom_vec_in; + custom_vec_in.emplace_back(paddle::Tensor()); + custom_kernel_ctx_.EmplaceBackInputs(std::move(custom_vec_in)); + } else { + input_shapes_.emplace_back(); + input_dtypes_.emplace_back(); + custom_kernel_ctx_.EmplaceBackInput(std::move(paddle::Tensor())); + } + VLOG(8) << "ctx->EmplaceBackInput : an optioanl input " << t; + continue; + } + + auto in_var_name = value_exec_info_.GetVarName(ptr); + VLOG(6) << "ctx->EmplaceBackInput: " << t << "\t" << in_var_name; + + PADDLE_ENFORCE_NOT_NULL(inner_scope->FindVar(in_var_name), + phi::errors::PreconditionNotMet( + "can not find var[%s] in scope", in_var_name)); + auto var = inner_scope->FindVar(in_var_name); + if (var->IsType()) { + auto dense_tensor_in = var->GetMutable(); + std::shared_ptr tensor_in( + dense_tensor_in, [](phi::DenseTensor* ptr) { + VLOG(6) << ptr << " ptr will not be deleted by shared_ptr"; + }); + input_shapes_.push_back(phi::vectorize(tensor_in->dims())); + input_dtypes_.push_back(tensor_in->dtype()); + paddle::Tensor custom_in; + custom_in.set_impl(tensor_in); + custom_kernel_ctx_.EmplaceBackInput(std::move(custom_in)); + } else if (var->IsType()) { + std::vector> vec_input_shape; + std::vector vec_input_dtype; + std::vector vec_custom_in; + auto& variable_array = var->Get(); + for (size_t i = 0; i < variable_array.size(); ++i) { + if (variable_array[i]->IsType()) { + phi::DenseTensor* dense_tensor_in = const_cast( + &(variable_array[i]->Get())); + std::shared_ptr tensor_in( + dense_tensor_in, [](phi::DenseTensor* ptr) { + VLOG(6) << ptr << " ptr will not be deleted by shared_ptr"; + }); + vec_input_shape.push_back(phi::vectorize(tensor_in->dims())); + vec_input_dtype.push_back(tensor_in->dtype()); + paddle::Tensor custom_in; + custom_in.set_impl(tensor_in); + 
+          vec_custom_in.push_back(std::move(custom_in));
+        } else {
+          PADDLE_THROW(phi::errors::Unimplemented(
+              "Only support Vector<DenseTensor> and vector<SelectedRows> now, "
+              "not support vector<%d>.",
+              variable_array[i]->Type()));
+        }
+      }
+      vec_input_shapes_.push_back(vec_input_shape);
+      vec_input_dtypes_.push_back(vec_input_dtype);
+      custom_kernel_ctx_.EmplaceBackInputs(vec_custom_in);
+    } else {
+      PADDLE_THROW(phi::errors::Unimplemented("Not support var type [%d] ",
+                                              var->Type()));
+    }
+  }
+
+  // EmplaceBackAttributes
+  auto& vec_attr_params = op_yaml_info.AttrParams(true);
+  for (auto& t : vec_attr_params) {
+    PADDLE_ENFORCE_NE(
+        attr_map.find(t),
+        attr_map.end(),
+        phi::errors::NotFound("Not found %s in attr_map, it may need to be "
+                              "mapped in OpTranslator.",
+                              t));
+    auto& attr_type_name = op_yaml_info.AttrTypeName(t);
+    if (attr_type_name == "pir::Int32Attribute") {
+      custom_attrs_.push_back(
+          attr_map[t].dyn_cast<pir::Int32Attribute>().data());
+      custom_kernel_ctx_.EmplaceBackAttr(
+          attr_map[t].dyn_cast<pir::Int32Attribute>().data());
+    } else if (attr_type_name == "pir::Int64Attribute") {
+      custom_attrs_.push_back(
+          attr_map[t].dyn_cast<pir::Int64Attribute>().data());
+      custom_kernel_ctx_.EmplaceBackAttr(
+          attr_map[t].dyn_cast<pir::Int64Attribute>().data());
+    } else if (attr_type_name == "pir::FloatAttribute") {
+      custom_attrs_.push_back(
+          attr_map[t].dyn_cast<pir::FloatAttribute>().data());
+      custom_kernel_ctx_.EmplaceBackAttr(
+          attr_map[t].dyn_cast<pir::FloatAttribute>().data());
+    } else if (attr_type_name == "pir::DoubleAttribute") {
+      custom_attrs_.push_back(
+          attr_map[t].dyn_cast<pir::DoubleAttribute>().data());
+      custom_kernel_ctx_.EmplaceBackAttr(
+          attr_map[t].dyn_cast<pir::DoubleAttribute>().data());
+    } else if (attr_type_name == "pir::BoolAttribute") {
+      custom_attrs_.push_back(
+          attr_map[t].dyn_cast<pir::BoolAttribute>().data());
+      custom_kernel_ctx_.EmplaceBackAttr(
+          attr_map[t].dyn_cast<pir::BoolAttribute>().data());
+    } else if (attr_type_name == "pir::StrAttribute") {
+      custom_attrs_.push_back(
+          attr_map[t].dyn_cast<pir::StrAttribute>().AsString());
+      custom_kernel_ctx_.EmplaceBackAttr(
+          attr_map[t].dyn_cast<pir::StrAttribute>().AsString());
+    } else if (attr_type_name == "pir::ArrayAttribute<pir::Int32Attribute>") {
+      auto array_list = attr_map[t].dyn_cast<pir::ArrayAttribute>().AsVector();
+      std::vector<int32_t> vec_res;
+      if (array_list.size() > 0) {
+        PADDLE_ENFORCE_EQ(
+            array_list[0].isa<pir::Int32Attribute>(),
+            true,
+            phi::errors::Unimplemented(
+                "the 0th element MUST be pir::Int32Attribute"));
+        for (size_t i = 0; i < array_list.size(); ++i) {
+          vec_res.push_back(
+              array_list[i].dyn_cast<pir::Int32Attribute>().data());
+        }
+      }
+      custom_attrs_.push_back(vec_res);
+      custom_kernel_ctx_.EmplaceBackAttr(vec_res);
+    } else if (attr_type_name == "pir::ArrayAttribute<pir::FloatAttribute>") {
+      auto array_list = attr_map[t].dyn_cast<pir::ArrayAttribute>().AsVector();
+      std::vector<float> vec_res;
+      if (array_list.size() > 0) {
+        if (array_list[0].isa<pir::FloatAttribute>()) {
+          for (size_t i = 0; i < array_list.size(); ++i) {
+            vec_res.push_back(
+                array_list[i].dyn_cast<pir::FloatAttribute>().data());
+          }
+
+        } else {
+          PADDLE_THROW(phi::errors::Unimplemented("attr type not support [%s] ",
+                                                  attr_type_name));
+        }
+      }
+      custom_attrs_.push_back(vec_res);
+      custom_kernel_ctx_.EmplaceBackAttr(vec_res);
+    } else if (attr_type_name == "pir::ArrayAttribute<pir::Int64Attribute>") {
+      auto array_list = attr_map[t].dyn_cast<pir::ArrayAttribute>().AsVector();
+
+      std::vector<int64_t> vec_res;
+      if (array_list.size() > 0) {
+        PADDLE_ENFORCE_EQ(
+            array_list[0].isa<pir::Int64Attribute>(),
+            true,
+            phi::errors::PreconditionNotMet(
+                "Element in array list MUST be pir::Int64Attribute "));
+
+        for (size_t i = 0; i < array_list.size(); ++i) {
+          vec_res.push_back(
+              array_list[i].dyn_cast<pir::Int64Attribute>().data());
+        }
+      }
+      custom_attrs_.push_back(vec_res);
+      custom_kernel_ctx_.EmplaceBackAttr(vec_res);
+    } else if (attr_type_name == "pir::ArrayAttribute<pir::StrAttribute>") {
+      auto array_list = attr_map[t].dyn_cast<pir::ArrayAttribute>().AsVector();
+
+      std::vector<std::string> vec_res;
+      if (array_list.size() > 0) {
+        PADDLE_ENFORCE_EQ(
+            array_list[0].isa<pir::StrAttribute>(),
+            true,
+            phi::errors::PreconditionNotMet(
+                "Element in array list MUST be pir::StrAttribute "));
+
+        for (size_t i = 0; i < array_list.size(); ++i) {
+          vec_res.push_back(
+              array_list[i].dyn_cast<pir::StrAttribute>().AsString());
+        }
+      }
+      custom_attrs_.push_back(vec_res);
+      custom_kernel_ctx_.EmplaceBackAttr(vec_res);
+
+    } else {
+      PADDLE_THROW(phi::errors::Unimplemented("attr type not support [%s] ",
+                                              attr_type_name));
+    }
+    VLOG(6) << "ctx->EmplaceBackAttr: " << t;
+  }
+
+  // EmplaceBackOutputs
+  VLOG(8) << "ctx->EmplaceBackOutput: ";
+  for (size_t i = 0; i < op_->num_results(); ++i) {
+    pir::Value out_ptr = op_->result(i);
+    if (!IsInvalid(out_ptr)) {
+      if (op_yaml_info.GetOutputType(i) ==
+          "pir::VectorType<paddle::dialect::DenseTensorType>") {
+        std::vector<paddle::Tensor> custom_vec_out;
+        custom_vec_out.emplace_back();
+        cache_out_ptrs_.emplace_back(nullptr);
+        custom_kernel_ctx_.EmplaceBackOutputs(std::move(custom_vec_out));
+      } else {
+        cache_out_ptrs_.emplace_back(nullptr);
+        custom_kernel_ctx_.EmplaceBackOutput(std::move(paddle::Tensor()));
+      }
+      VLOG(8) << "ctx->EmplaceBackOutput : an optional output";
+      continue;
+    }
+
+    if (out_ptr.type().isa<paddle::dialect::AllocatedDenseTensorType>()) {
+      auto dense_tensor_out =
+          inner_scope->FindVar(value_exec_info_.GetVarName(out_ptr))
+              ->GetMutable<phi::DenseTensor>();
+      cache_out_ptrs_.push_back(dense_tensor_out);
+      std::shared_ptr<phi::DenseTensor> tensor_out(
+          dense_tensor_out, [](phi::DenseTensor* ptr) {
+            VLOG(6) << ptr << " ptr will not be deleted by shared_ptr";
+          });
+      paddle::Tensor custom_out;
+      // here we can only copy the output tensor into the context
+      custom_out.set_impl(tensor_out);
+
+      custom_kernel_ctx_.EmplaceBackOutput(std::move(custom_out));
+      VLOG(8) << "ctx->EmplaceBackOutput DenseTensor: "
+              << value_exec_info_.GetVarName(out_ptr);
+    } else if (out_ptr.type().isa<pir::VectorType>()) {
+      std::vector<paddle::Tensor> vec_custom_out;
+      auto& variable_array =
+          inner_scope->FindVar(value_exec_info_.GetVarName(out_ptr))
+              ->Get<VariableRefArray>();
+      std::vector<paddle::Tensor> custom_vec_out;
+      for (size_t i = 0; i < variable_array.size(); ++i) {
+        if (variable_array[i]->IsType<phi::DenseTensor>()) {
+          auto dense_tensor_out = const_cast<phi::DenseTensor*>(
+              &(variable_array[i]->Get<phi::DenseTensor>()));
+          cache_out_ptrs_.emplace_back(dense_tensor_out);
+          std::shared_ptr<phi::DenseTensor> tensor_out(
+              dense_tensor_out, [](phi::DenseTensor* ptr) {
+                VLOG(6) << ptr << " ptr will not be deleted by shared_ptr";
+              });
+          paddle::Tensor custom_out;
+          custom_out.set_impl(tensor_out);
+          custom_vec_out.push_back(std::move(custom_out));
+        } else {
+          PADDLE_THROW(phi::errors::Unimplemented(
+              "Only support Vector<DenseTensor> and vector<SelectedRows> now, "
+              "not support vector<%d>.",
+              variable_array[i]->Type()));
+        }
+      }
+      VLOG(8) << "ctx->EmplaceBackOutput VariableRefArray: "
+              << value_exec_info_.GetVarName(out_ptr);
+      custom_kernel_ctx_.EmplaceBackOutputs(custom_vec_out);
+    } else {
+      PADDLE_THROW(phi::errors::Unimplemented(
+          "only support DenseTensor and vector<DenseTensor> "));
+    }
+  }
+  auto& op_inputs = OpMetaInfoHelper::GetInputs(*custom_op_meta_);
+  auto& op_outputs = OpMetaInfoHelper::GetOutputs(*custom_op_meta_);
+  auto& op_inplace_map = OpMetaInfoHelper::GetInplaceMap(*custom_op_meta_);
+  // handle inplace map
+  custom_kernel_ctx_.UpdatePlainOutputs(op_inputs, op_outputs, op_inplace_map);
+  VLOG(6) << "Done build custom context";
+}
+
+CustomKernelInstruction::CustomKernelInstruction(
+    size_t id,
+    const platform::Place& place,
+    pir::Operation* op,
+    const ValueExecutionInfo& value_exec_info)
+    : InstructionBase(id, place), value_exec_info_(value_exec_info) {
+  auto op_attributes = op->attributes();
+  auto op_name =
+      op_attributes.at("op_name").dyn_cast<pir::StrAttribute>().AsString();
+  pir::OpInfo op_info =
+      pir::IrContext::Instance()->GetRegisteredOpInfo(op_name);
+  op_ = op;
+  custom_op_name_ = op_name;
+  VLOG(6) << "construct custom kernel instruction for: " << custom_op_name_;
+
+  VLOG(6) << "finish process dist attributes";
+
+  SetKernelType(AnalyseOpFuncType(op, place));
+  VLOG(6) << "finish process analyse kernel type";
+
+  auto yaml_interface =
+      op_info.GetInterfaceImpl<paddle::dialect::OpYamlInfoInterface>();
+  PADDLE_ENFORCE_NOT_NULL(
+      yaml_interface,
+      phi::errors::PreconditionNotMet(
+          "can not find OpYamlInfoInterface from [%s]", custom_op_name_));
+  paddle::dialect::OpYamlInfoParser yaml_info_parser(
+      yaml_interface->get_op_info_(custom_op_name_),
+      paddle::dialect::IsLegacyOp(custom_op_name_));
+  VLOG(6) << "finish process yaml_info_parser";
+
+  const auto& op_meta =
+      paddle::framework::detail::GetOpInfoByPirName(custom_op_name_);
+  custom_op_meta_ = &op_meta;
+  infershape_func_ = OpMetaInfoHelper::GetInferShapeFn(op_meta);
+  inferdtype_func_ = OpMetaInfoHelper::GetInferDtypeFn(op_meta);
+  kernel_func_ = OpMetaInfoHelper::GetKernelFn(op_meta);
+  BuildCustomContext(yaml_info_parser);
+  VLOG(6) << "finish process custom context";
+  auto kernel_key = op_attributes.at("kernel_key")
+                        .dyn_cast<paddle::dialect::KernelAttribute>()
+                        .data();
+  SetDeviceContext(
+      ParseDeviceContext(op,
+                         phi::DeviceContextPool::Instance().Get(
+                             phi::TransToPhiPlace(kernel_key.backend())),
+                         place,
+                         GetExecutionStream(),
+                         GetStreamPriority()));
+  VLOG(6) << "finish process device context";
+
+  InitInputsOutputsIds(op, value_exec_info_);
+  VLOG(6) << "finish process inputs outputs index";
+
+  auto& no_need_buffer_ids = yaml_info_parser.NoNeedBufferIds();
+  std::unordered_set<pir::Value> no_need_buffer_values;
+  for (size_t id = 0; id < no_need_buffer_ids.size(); id++) {
+    no_need_buffer_values.insert(op->operand_source(no_need_buffer_ids[id]));
+  }
+  SetNoNeedBuffer(no_need_buffer_values);
+  VLOG(6) << "finish process no need buffer";
+}
+
+void CustomKernelInstruction::UpdateOutputMeta(
+    const std::vector<std::vector<int64_t>>& output_shapes,
+    const std::vector<phi::DataType>& output_dtypes) {
+  PADDLE_ENFORCE_EQ(
+      output_shapes.size(),
+      cache_out_ptrs_.size(),
+      phi::errors::InvalidArgument(
+          "The number of output shapes after running custom operator's "
+          "InferShapeFunc is wrong, "
+          "expected to contain %d Tensors' shapes, but actually contains %d "
+          "Tensors' shapes",
+          cache_out_ptrs_.size(),
+          output_shapes.size()));
+
+  PADDLE_ENFORCE_EQ(
+      output_dtypes.size(),
+      cache_out_ptrs_.size(),
+      phi::errors::InvalidArgument(
+          "The number of output dtypes after running custom operator's "
+          "InferDtypeFunc is wrong, "
+          "expected to contain %d Tensors' dtypes, but actually contains %d "
+          "Tensors' dtypes",
+          cache_out_ptrs_.size(),
+          output_dtypes.size()));
+
+  for (size_t i = 0; i < cache_out_ptrs_.size(); ++i) {
+    auto out_in_scope = cache_out_ptrs_.at(i);
+    // update dims and dtype
+    auto out_meta = phi::DenseTensorUtils::GetMutableMeta(out_in_scope);
+    out_meta->dims = phi::make_ddim(output_shapes[i]);
+    out_meta->dtype = output_dtypes[i];
+  }
+}
+
+void CustomKernelInstruction::Run() {
+  VLOG(3) << "Custom Operator: InferShape - calc output ddim.";
+  std::vector<std::vector<int64_t>> output_shapes;
+  std::vector<phi::DataType> output_dtypes;
+  if (infershape_func_) {
+    output_shapes =
+        infershape_func_(input_shapes_, vec_input_shapes_, custom_attrs_);
+  } else {
+    PADDLE_ENFORCE_EQ(
+        OpMetaInfoHelper::GetInputs(*custom_op_meta_).size(),
+        1UL,
+        phi::errors::Unavailable(
+            "Your custom operator contains multiple inputs. "
" + "We only allow a custom operator that contains only one input " + "and only one output without setting the InferShapeFn. " + "At this time, the input shape will be directly set to " + "the output shape.\n" + "Please set the InferShapeFn of custom " + "operator by .SetInferShapeFn(PD_INFER_SHAPE(...))")); + PADDLE_ENFORCE_EQ( + OpMetaInfoHelper::GetOutputs(*custom_op_meta_).size(), + 1UL, + phi::errors::Unavailable( + "Your custom operator contains multiple outputs. " + "We only allow a custom operator that contains only one input " + "and only one output without setting the InferShapeFn. " + "At this time, the input shape will be directly set to " + "the output shape.\n" + "Please set the InferShapeFn of custom " + "operator by .SetInferShapeFn(PD_INFER_SHAPE(...))")); + + VLOG(3) << "Custom Operator: Default InferShape - share ddim."; + if (input_shapes_.size() == 1) { + output_shapes = input_shapes_; + } else if (vec_input_shapes_.size() == 1) { + output_shapes = vec_input_shapes_[0]; + } else { + PADDLE_THROW(phi::errors::Unavailable( + "We only allow a custom operator that contains only one input " + "and only one output without setting the InferShapeFn. ")); + } + } + + if (inferdtype_func_) { + output_dtypes = + inferdtype_func_(input_dtypes_, vec_input_dtypes_, custom_attrs_); + } else { + PADDLE_ENFORCE_EQ( + OpMetaInfoHelper::GetInputs(*custom_op_meta_).size(), + 1UL, + phi::errors::Unavailable( + "Your custom operator contains multiple inputs. " + "We only allow a custom operator that contains only one input " + "and only one output without setting the InferDtypeFn. " + "At this time, the input dtype will be directly set to " + "the output dtype.\n" + "Please set the InferDtypeFn of custom " + "operator by `.SetInferDtypeFn(PD_INFER_DTYPE(...))`")); + PADDLE_ENFORCE_EQ( + OpMetaInfoHelper::GetOutputs(*custom_op_meta_).size(), + 1UL, + phi::errors::Unavailable( + "Your custom operator contains multiple outputs. " + "We only allow a custom operator that contains only one input " + "and only one output without setting the InferDtypeFn. " + "At this time, the input dtype will be directly set to " + "the output dtype.\n" + "Please set the InferDtypeFn of custom " + "operator by `.SetInferDtypeFn(PD_INFER_DTYPE(...))`")); + + VLOG(3) << "Custom Operator: InferDtype - share dtype."; + if (input_dtypes_.size() == 1) { + output_dtypes = input_dtypes_; + } else if (vec_input_dtypes_.size() == 1) { + output_dtypes = vec_input_dtypes_[0]; + } else { + PADDLE_THROW(phi::errors::Unavailable( + "We only allow a custom operator that contains only one input " + "and only one output without setting the InferDtypeFn. ")); + } + } + UpdateOutputMeta(output_shapes, output_dtypes); + + VLOG(6) << "Run custom op " << custom_op_name_ << " kernel."; + kernel_func_(&custom_kernel_ctx_); + custom_kernel_ctx_.AssignInplaceOutputs(); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/instruction/custom_kernel_instruction.h b/paddle/fluid/framework/new_executor/instruction/custom_kernel_instruction.h new file mode 100644 index 0000000000000..6c6a7d90ae8f0 --- /dev/null +++ b/paddle/fluid/framework/new_executor/instruction/custom_kernel_instruction.h @@ -0,0 +1,78 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
diff --git a/paddle/fluid/framework/new_executor/instruction/custom_kernel_instruction.h b/paddle/fluid/framework/new_executor/instruction/custom_kernel_instruction.h
new file mode 100644
index 0000000000000..6c6a7d90ae8f0
--- /dev/null
+++ b/paddle/fluid/framework/new_executor/instruction/custom_kernel_instruction.h
@@ -0,0 +1,78 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/new_executor/instruction/instruction_base.h"
+#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h"
+#include "paddle/phi/api/ext/op_meta_info.h"
+
+namespace pir {
+class Operation;
+}  // namespace pir
+
+namespace paddle {
+namespace framework {
+class Scope;
+
+class CustomKernelInstruction : public InstructionBase {
+ public:
+  CustomKernelInstruction(size_t id,
+                          const platform::Place& place,
+                          ::pir::Operation* op,
+                          const ValueExecutionInfo& value_exec_info);
+
+  ::pir::Operation* Operation() const override { return op_; }
+
+  void Run() override;
+
+  const std::string& Name() const override { return custom_op_name_; }
+
+  void clear();
+
+ private:
+  void BuildCustomContext(
+      const paddle::dialect::OpYamlInfoParser& op_yaml_info);
+
+  void UpdateOutputMeta(const std::vector<std::vector<int64_t>>& output_shapes,
+                        const std::vector<phi::DataType>& output_dtypes);
+
+  paddle::CustomOpKernelContext custom_kernel_ctx_;
+
+  paddle::InferShapeFunc infershape_func_ = nullptr;
+  paddle::InferDtypeFunc inferdtype_func_ = nullptr;
+  paddle::KernelFunc kernel_func_ = nullptr;
+
+  // used for running infershape
+  std::vector<std::vector<int64_t>> input_shapes_;
+  std::vector<std::vector<std::vector<int64_t>>> vec_input_shapes_;
+  std::vector<paddle::any> custom_attrs_;
+
+  // used for running inferdtype
+  std::vector<phi::DataType> input_dtypes_;
+  std::vector<std::vector<phi::DataType>> vec_input_dtypes_;
+
+  // used for updating output
+  std::vector<phi::DenseTensor*> cache_out_ptrs_;
+
+  std::string custom_op_name_;
+
+  ::pir::Operation* op_{nullptr};  // not owned
+
+  const paddle::OpMetaInfo* custom_op_meta_;   // not owned
+  const ValueExecutionInfo& value_exec_info_;  // not owned
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc
index 1dc779b1d43a9..812f86704ee50 100644
--- a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc
+++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc
@@ -106,7 +106,8 @@ LegacyKernelInstruction::LegacyKernelInstruction(
       phi::errors::PreconditionNotMet(
           "can not find OpYamlInfoInterface from [%s]", legacy_op_name_));
   paddle::dialect::OpYamlInfoParser yaml_info_parser(
-      yaml_interface->get_op_info_(), paddle::dialect::IsLegacyOp(op_name));
+      yaml_interface->get_op_info_(op_name),
+      paddle::dialect::IsLegacyOp(op_name));
   VLOG(6) << "finish process yaml_info_parser";
 
   if (infer_meta_interface_) {
diff --git a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_phi_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_phi_kernel_instruction.cc
index 71385619cb958..fb8407a1a7ea3 100644
--- a/paddle/fluid/framework/new_executor/instruction/onednn/onednn_phi_kernel_instruction.cc
+++ b/paddle/fluid/framework/new_executor/instruction/onednn/onednn_phi_kernel_instruction.cc
@@ -216,7 +216,7 @@ OneDNNPhiKernelInstruction::OneDNNPhiKernelInstruction(
       phi::errors::PreconditionNotMet(
           "can not find OpYamlInfoInterface from [%s]", phi_op_name_));
   paddle::dialect::OpYamlInfoParser yaml_info_parser(
-      yaml_interface->get_op_info_(),
+      yaml_interface->get_op_info_(op_name),
       paddle::dialect::IsOneDNNLegacyOp(op_name));
   VLOG(6) << "finish process yaml_info_parser";
 
diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc
index 798735f24058d..ed5bee9ce8777 100644
--- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc
+++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc
@@ -110,7 +110,8 @@ PhiKernelInstruction::PhiKernelInstruction(
       phi::errors::PreconditionNotMet(
           "can not find OpYamlInfoInterface from [%s]", phi_op_name_));
   paddle::dialect::OpYamlInfoParser yaml_info_parser(
-      yaml_interface->get_op_info_(), paddle::dialect::IsLegacyOp(op_name));
+      yaml_interface->get_op_info_(op_name),
+      paddle::dialect::IsLegacyOp(op_name));
   VLOG(6) << "finish process yaml_info_parser";
 
   if (infer_meta_interface_) {
diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc
index 614b97c26b7b0..0a111922d4409 100644
--- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc
+++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc
@@ -1304,7 +1304,7 @@ std::vector<std::string> GetOriginInputNames(const std::string& op_name) {
   if (op_info.GetInterfaceImpl<paddle::dialect::OpYamlInfoInterface>()) {
     paddle::dialect::OpYamlInfoParser yaml_parser(
         op_info.GetInterfaceImpl<paddle::dialect::OpYamlInfoInterface>()
-            ->get_op_info_());
+            ->get_op_info_(op_name));
     ret = yaml_parser.InputNames();
   }
   return ret;
@@ -1317,7 +1317,7 @@ std::vector<std::string> GetOriginOutputNames(const std::string& op_name) {
   if (op_info.GetInterfaceImpl<paddle::dialect::OpYamlInfoInterface>()) {
     paddle::dialect::OpYamlInfoParser yaml_parser(
         op_info.GetInterfaceImpl<paddle::dialect::OpYamlInfoInterface>()
-            ->get_op_info_());
+            ->get_op_info_(op_name));
     ret = yaml_parser.OutputNames();
   }
   return ret;
diff --git a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc
index a06abb197de5f..be32e1f473a1b 100644
--- a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc
+++ b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc
@@ -658,7 +658,7 @@ void HandleForInplaceOp(pir::Operation* op,
   pir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_name);
   paddle::dialect::OpYamlInfoParser yaml_parser(
       op_info.GetInterfaceImpl<paddle::dialect::OpYamlInfoInterface>()
-          ->get_op_info_(),
+          ->get_op_info_(op_name),
      paddle::dialect::IsLegacyOp(op_name));
 
   for (size_t i = 0; i < op->num_results(); ++i) {
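The mechanical change repeated across these call sites, in isolation — a hedged sketch of the new call shape (variable names hypothetical):

// before: yaml_interface->get_op_info_()          // callee already knew its op
// after:  the registered name is threaded through, so one OpInfo entry can
//         serve ops (e.g. custom ops) whose metadata is keyed by name.
auto* iface = op_info.GetInterfaceImpl<paddle::dialect::OpYamlInfoInterface>();
if (iface) {
  paddle::dialect::OpYamlInfoParser parser(
      iface->get_op_info_(op_name), paddle::dialect::IsLegacyOp(op_name));
}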
diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc
index 2afdfb5e9717a..19e3d6e86ebde 100644
--- a/paddle/fluid/framework/new_executor/pir_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc
@@ -56,6 +56,7 @@
 #include "paddle/fluid/framework/new_executor/instruction/control_flow/tuple_pop_instruction.h"
 #include "paddle/fluid/framework/new_executor/instruction/control_flow/tuple_push_instruction.h"
 #include "paddle/fluid/framework/new_executor/instruction/control_flow/while_instruction.h"
+#include "paddle/fluid/framework/new_executor/instruction/custom_kernel_instruction.h"
 #include "paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h"
 #include "paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h"
 #include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h"
@@ -749,6 +750,10 @@ void PirInterpreter::BuildInstruction() {
     } else if (op.dialect()->name() == "cinn_runtime") {
       CREATE_INSTR(CinnJitInstruction);
 #endif
+    } else if (op.dialect()->name() == "custom_kernel") {
+      vec_instruction_base_.emplace_back(
+          std::make_unique<CustomKernelInstruction>(
+              op_idx++, place_, &op, *(value_exe_info_.get())));
     } else {
       PADDLE_THROW(platform::errors::Unimplemented(
           "Now only support pd_kernel and cinn dialect."));
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 4af55a7c6c933..4b52ceb58ff77 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -783,7 +783,6 @@ bool AnalysisPredictor::PrepareExecutor() {
     auto output_names = GetOutputNames();
     execution_config.skip_gc_vars.insert(output_names.begin(),
                                          output_names.end());
-
     if (FLAGS_enable_pir_in_executor) {
       pir_program_ = std::move(
           paddle::TranslateLegacyProgramToProgram(*inference_program_));
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index 0cca4532a0ce6..778ce2055e0b5 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -7,6 +7,7 @@ option(WITH_STATIC_LIB
 option(USE_TENSORRT "Compile demo with TensorRT." OFF)
 option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF)
 option(WITH_SHARED_PHI "Compile demo with phi shared lib" ON)
+option(CUSTOM_OPERATOR_FILES "List of file names for custom operators" "")
 
 if(NOT WITH_STATIC_LIB)
   add_definitions("-DPADDLE_WITH_SHARED_LIB")
@@ -252,6 +253,18 @@ if(WITH_GPU)
   endif()
 endif()
 
+if(CUSTOM_OPERATOR_FILES)
+  if(WITH_GPU AND NOT APPLE)
+    add_definitions("-DPADDLE_WITH_CUDA")
+    enable_language(CUDA)
+    find_package(CUDA REQUIRED)
+    include_directories("${CUDA_INCLUDE_DIRS}")
+  endif()
+  add_library(pd_infer_custom_op SHARED ${CUSTOM_OPERATOR_FILES})
+  target_link_libraries(pd_infer_custom_op ${DEPS})
+  set(DEPS ${DEPS} pd_infer_custom_op)
+endif()
+
 add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
 target_link_libraries(${DEMO_NAME} ${DEPS})
 if(WIN32)
diff --git a/paddle/fluid/inference/api/demo_ci/custom_op_demo.cc b/paddle/fluid/inference/api/demo_ci/custom_op_demo.cc
new file mode 100644
index 0000000000000..b4c8cccb8e790
--- /dev/null
+++ b/paddle/fluid/inference/api/demo_ci/custom_op_demo.cc
@@ -0,0 +1,64 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <numeric>
+
+#include "paddle_inference_api.h"  //NOLINT
+
+DEFINE_string(modeldir, "", "Directory of the inference model.");
+
+using paddle_infer::Config;
+using paddle_infer::CreatePredictor;
+using paddle_infer::Predictor;
+
+void run(Predictor *predictor,
+         const std::vector<float> &input,
+         const std::vector<int> &input_shape,
+         std::vector<float> *out_data) {
+  auto input_names = predictor->GetInputNames();
+  auto input_t = predictor->GetInputHandle(input_names[0]);
+  input_t->Reshape(input_shape);
+  input_t->CopyFromCpu(input.data());
+
+  CHECK(predictor->Run());
+
+  auto output_names = predictor->GetOutputNames();
+  auto output_t = predictor->GetOutputHandle(output_names[0]);
+  std::vector<int> output_shape = output_t->shape();
+  int out_num = std::accumulate(
+      output_shape.begin(), output_shape.end(), 1, std::multiplies<int>());
+
+  out_data->resize(out_num);
+  output_t->CopyToCpu(out_data->data());
+}
+
+int main(int argc, char **argv) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  paddle::AnalysisConfig config;
+  config.EnableUseGpu(100, 0);
+  config.SetModel(FLAGS_modeldir + "/custom_relu.pdmodel",
+                  FLAGS_modeldir + "/custom_relu.pdiparams");
+  config.EnableNewExecutor(true);
+  auto predictor{paddle_infer::CreatePredictor(config)};
+  std::vector<int> input_shape = {1, 1, 28, 28};
+  std::vector<float> input_data(1 * 1 * 28 * 28, 1);
+  std::vector<float> out_data;
+  run(predictor.get(), input_data, input_shape, &out_data);
+  for (auto e : out_data) {
+    LOG(INFO) << e << '\n';
+  }
+  return 0;
+}
diff --git a/paddle/fluid/inference/api/demo_ci/custom_relu_op.cc b/paddle/fluid/inference/api/demo_ci/custom_relu_op.cc
new file mode 100755
index 0000000000000..e55b943a5568f
--- /dev/null
+++ b/paddle/fluid/inference/api/demo_ci/custom_relu_op.cc
@@ -0,0 +1,105 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <iostream>
+#include <vector>
+
+#include "paddle/extension.h"
+
+template <typename data_t>
+void relu_cpu_forward_kernel(const data_t* x_data,
+                             data_t* out_data,
+                             int64_t x_numel) {
+  for (int i = 0; i < x_numel; ++i) {
+    out_data[i] = std::max(static_cast<data_t>(0.), x_data[i]);
+  }
+}
+
+template <typename data_t>
+void relu_cpu_backward_kernel(const data_t* grad_out_data,
+                              const data_t* out_data,
+                              data_t* grad_x_data,
+                              int64_t out_numel) {
+  for (int i = 0; i < out_numel; ++i) {
+    grad_x_data[i] =
+        grad_out_data[i] * (out_data[i] > static_cast<data_t>(0) ? 1. : 0.);
+  }
+}
+
+std::vector<paddle::Tensor> relu_cpu_forward(const paddle::Tensor& x) {
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());
+
+  PD_DISPATCH_FLOATING_TYPES(
+      x.type(), "relu_cpu_forward", ([&] {
+        relu_cpu_forward_kernel<data_t>(
+            x.data<data_t>(), out.mutable_data<data_t>(x.place()), x.size());
+      }));
+
+  return {out};
+}
+
+std::vector<paddle::Tensor> relu_cpu_backward(const paddle::Tensor& x,
+                                              const paddle::Tensor& out,
+                                              const paddle::Tensor& grad_out) {
+  auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());
+
+  PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] {
+                               relu_cpu_backward_kernel<data_t>(
+                                   grad_out.data<data_t>(),
+                                   out.data<data_t>(),
+                                   grad_x.mutable_data<data_t>(x.place()),
+                                   out.size());
+                             }));
+
+  return {grad_x};
+}
+
+std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x);
+std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& x,
+                                               const paddle::Tensor& out,
+                                               const paddle::Tensor& grad_out);
+
+std::vector<paddle::Tensor> ReluForward(const paddle::Tensor& x) {
+  // TODO(chenweihang): Check Input
+  if (x.place() == paddle::PlaceType::kCPU) {
+    return relu_cpu_forward(x);
+  } else if (x.place() == paddle::PlaceType::kGPU) {
+    return relu_cuda_forward(x);
+  } else {
+    throw std::runtime_error("Not implemented.");
+  }
+}
+
+std::vector<paddle::Tensor> ReluBackward(const paddle::Tensor& x,
+                                         const paddle::Tensor& out,
+                                         const paddle::Tensor& grad_out) {
+  // TODO(chenweihang): Check Input
+  if (x.place() == paddle::PlaceType::kCPU) {
+    return relu_cpu_backward(x, out, grad_out);
+  } else if (x.place() == paddle::PlaceType::kGPU) {
+    return relu_cuda_backward(x, out, grad_out);
+  } else {
+    throw std::runtime_error("Not implemented.");
+  }
+}
+
+PD_BUILD_OP(custom_relu)
+    .Inputs({"X"})
+    .Outputs({"Out"})
+    .SetKernelFn(PD_KERNEL(ReluForward));
+
+PD_BUILD_GRAD_OP(custom_relu)
+    .Inputs({"X", "Out", paddle::Grad("Out")})
+    .Outputs({paddle::Grad("X")})
+    .SetKernelFn(PD_KERNEL(ReluBackward));
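For orientation, PD_DISPATCH_FLOATING_TYPES above is essentially a dtype switch that instantiates the lambda body per floating type — a simplified sketch of the expansion, not the real macro:

switch (x.type()) {
  case paddle::DataType::FLOAT32: {
    using data_t = float;  // the name the lambda body relies on
    relu_cpu_forward_kernel<data_t>(
        x.data<data_t>(), out.mutable_data<data_t>(x.place()), x.size());
    break;
  }
  case paddle::DataType::FLOAT64: {
    using data_t = double;
    relu_cpu_forward_kernel<data_t>(
        x.data<data_t>(), out.mutable_data<data_t>(x.place()), x.size());
    break;
  }
  default:
    PD_THROW("relu_cpu_forward not implemented for this data type.");
}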
diff --git a/paddle/fluid/inference/api/demo_ci/custom_relu_op.cu b/paddle/fluid/inference/api/demo_ci/custom_relu_op.cu
new file mode 100644
index 0000000000000..a4b7fcf06bce6
--- /dev/null
+++ b/paddle/fluid/inference/api/demo_ci/custom_relu_op.cu
@@ -0,0 +1,71 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/extension.h"
+
+template <typename data_t>
+__global__ void relu_cuda_forward_kernel(const data_t* x,
+                                         data_t* y,
+                                         const int num) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
+    y[i] = max(x[i], static_cast<data_t>(0.));
+  }
+}
+
+template <typename data_t>
+__global__ void relu_cuda_backward_kernel(const data_t* dy,
+                                          const data_t* y,
+                                          data_t* dx,
+                                          const int num) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
+    dx[i] = dy[i] * (y[i] > 0 ? 1. : 0.);
+  }
+}
+
+std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x) {
+  auto out = paddle::Tensor(paddle::PlaceType::kGPU, x.shape());
+
+  int numel = x.size();
+  int block = 512;
+  int grid = (numel + block - 1) / block;
+  PD_DISPATCH_FLOATING_TYPES(
+      x.type(), "relu_cuda_forward_kernel", ([&] {
+        relu_cuda_forward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
+            x.data<data_t>(), out.mutable_data<data_t>(x.place()), numel);
+      }));
+
+  return {out};
+}
+
+std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& x,
+                                               const paddle::Tensor& out,
+                                               const paddle::Tensor& grad_out) {
+  auto grad_x = paddle::Tensor(paddle::PlaceType::kGPU, x.shape());
+
+  int numel = out.size();
+  int block = 512;
+  int grid = (numel + block - 1) / block;
+  PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cuda_backward_kernel", ([&] {
+                               relu_cuda_backward_kernel<data_t>
+                                   <<<grid, block, 0, x.stream()>>>(
+                                       grad_out.data<data_t>(),
+                                       out.data<data_t>(),
+                                       grad_x.mutable_data<data_t>(x.place()),
+                                       numel);
+                             }));
+
+  return {grad_x};
+}
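A quick check of the launch arithmetic above for the demo's 1x1x28x28 input — compile-time sketch, not part of the PR:

constexpr int numel = 1 * 1 * 28 * 28;             // 784 elements
constexpr int block = 512;
constexpr int grid = (numel + block - 1) / block;  // ceil(784 / 512)
static_assert(grid == 2, "two blocks cover 784 elements");
// The grid-stride loops in the kernels tolerate any grid size regardless.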
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index 50112b20f29a0..795b414258b56 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -102,6 +102,17 @@ else
     wget -q http://paddle-inference-dist.bj.bcebos.com/word2vec.inference.model.tar.gz
     tar xzf *.tar.gz
 fi
+cd ..
+
+#download custom_op_demo data
+mkdir -p custom_op
+cd custom_op
+if [[ -e "custom_relu_infer_model.tgz" ]]; then
+  echo "custom_relu_infer_model.tgz has been downloaded."
+else
+  wget -q https://paddle-inference-dist.bj.bcebos.com/inference_demo/custom_operator/custom_relu_infer_model.tgz
+  tar xzf *.tgz
+fi
 
 # compile and test the demo
 cd $current_dir
@@ -275,6 +286,28 @@ for WITH_STATIC_LIB in ON OFF; do
         EXIT_CODE=1
       fi
    fi
+
+    # --------custom op demo on linux/mac------
+    if [ $TEST_GPU_CPU == ON -a $WITH_STATIC_LIB == OFF ]; then
+      rm -rf *
+      CUSTOM_OPERATOR_FILES="custom_relu_op.cc;custom_relu_op.cu"
+      cmake .. -DPADDLE_LIB=${inference_install_dir} \
+        -DWITH_MKL=$TURN_ON_MKL \
+        -DDEMO_NAME=custom_op_demo \
+        -DWITH_GPU=$TEST_GPU_CPU \
+        -DWITH_STATIC_LIB=OFF \
+        -DUSE_TENSORRT=$USE_TENSORRT \
+        -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \
+        -DCUSTOM_OPERATOR_FILES=$CUSTOM_OPERATOR_FILES \
+        -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME
+      make -j$(nproc)
+      FLAGS_enable_pir_in_executor=1 ./custom_op_demo \
+        --modeldir=$DATA_DIR/custom_op/custom_relu_infer_model
+      if [ $? -ne 0 ]; then
+        echo "custom_op_demo runs failed " >> ${current_dir}/test_summary.txt
+        EXIT_CODE=1
+      fi
+    fi
   fi
 done
diff --git a/paddle/fluid/inference/api/helper.cc b/paddle/fluid/inference/api/helper.cc
index 3fd8ed490fe45..44d7a75cae21a 100644
--- a/paddle/fluid/inference/api/helper.cc
+++ b/paddle/fluid/inference/api/helper.cc
@@ -16,8 +16,13 @@
 
 #include "paddle/fluid/framework/custom_operator.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/phi/api/ext/op_meta_info.h"
+#include "paddle/phi/core/flags.h"
+#include "paddle/pir/core/ir_context.h"
+
+PHI_DECLARE_bool(enable_pir_in_executor);
 
 namespace paddle {
 namespace inference {
@@ -49,6 +54,21 @@ void RegisterAllCustomOperator() {
   auto &op_meta_info_map = OpMetaInfoMap::Instance();
   const auto &meta_info_map = op_meta_info_map.GetMap();
   for (auto &pair : meta_info_map) {
+    if (FLAGS_enable_pir_in_executor) {
+      ::pir::IrContext *ctx = ::pir::IrContext::Instance();
+      auto *custom_dialect =
+          ctx->GetOrRegisterDialect<paddle::dialect::CustomOpDialect>();
+      if (custom_dialect->HasRegistered(pair.first)) {
+        LOG(INFO) << "The operator `" << pair.first
+                  << "` has been registered. "
+                     "Therefore, we will not repeat the registration here.";
+        continue;
+      }
+      for (const auto &meta_info : pair.second) {
+        LOG(INFO) << "register pir custom op :" << pair.first;
+        custom_dialect->RegisterCustomOp(meta_info);
+      }
+    }
     const auto &all_op_kernels{framework::OperatorWithKernel::AllOpKernels()};
     if (all_op_kernels.find(pair.first) == all_op_kernels.end()) {
       framework::RegisterOperatorWithMetaInfo(pair.second);
diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc
index c64004c7191dd..68e9a89cefb76 100644
--- a/paddle/fluid/ir_adaptor/translator/op_translator.cc
+++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc
@@ -83,6 +83,7 @@ constexpr char kTargetDialectPrefix[] = "pd_op.";  // NOLINT
 #ifdef PADDLE_WITH_DNNL
 constexpr char kOneDNNTargetDialectPrefix[] = "pd_onednn_op.";  // NOLINT
 #endif
+constexpr char kCustomOpDialectPrefix[] = "custom_op.";
 constexpr char kEmptyVarName[] = "@EMPTY@";  // NOLINT
 
 static const std::unordered_set<std::string> SpecialNonInplaceOps = {};
@@ -229,16 +230,27 @@ inline pir::Operation* InsertCreateArrayOp(pir::IrContext* ctx,
   return create_array_op.operation();
 }
 
+inline bool HasOpInfo(pir::IrContext* ctx,
+                      const OpDesc& op_desc,
+                      std::string prefix) {
+  std::string target_op_name = prefix + OpNameCompatibleMapping(op_desc.Type());
+  if (IsInplace(op_desc) && *target_op_name.rbegin() != '_') {
+    target_op_name += "_";
+  }
+  auto op_info = ctx->GetRegisteredOpInfo(target_op_name);
+  if (op_info) {
+    return true;
+  }
+  return false;
+}
+
 inline std::string GetPrefix(pir::IrContext* ctx, const OpDesc& op_desc) {
+  if (HasOpInfo(ctx, op_desc, kCustomOpDialectPrefix)) {
+    return kCustomOpDialectPrefix;
+  }
 #ifdef PADDLE_WITH_DNNL
   if (op_desc.GetAttrIfExists<bool>("use_mkldnn")) {
-    std::string target_op_name =
-        kOneDNNTargetDialectPrefix + OpNameCompatibleMapping(op_desc.Type());
-    if (IsInplace(op_desc) && *target_op_name.rbegin() != '_') {
-      target_op_name += "_";
-    }
-    auto op_info = ctx->GetRegisteredOpInfo(target_op_name);
-    if (!op_info) {
+    if (!HasOpInfo(ctx, op_desc, kOneDNNTargetDialectPrefix)) {
       VLOG(3) << op_desc.Type()
               << "'s use_mkldnn == True, but PIR not support OneDNN for this "
                  "op right now.";
@@ -284,7 +296,7 @@ pir::OpInfo OpTranscriber::LoopkUpOpInfo(pir::IrContext* ctx,
   OpAttributeInfoList attr_infos;
   OpOutputInfoList output_infos;
   std::tie(input_infos, attr_infos, output_infos, std::ignore, std::ignore) =
-      op_info_concept->get_op_info_();
+      op_info_concept->get_op_info_(op_info.name());
 
   auto& op_normalizer = OpNameNormalizer::instance();
   std::vector<std::string> need_inputs_sig;
@@ -355,9 +367,6 @@ pir::OpInfo OpTranscriber::LoopkUpOpInfo(pir::IrContext* ctx,
   if (IsInplace(op_desc) && *target_op_name.rbegin() != '_') {
     target_op_name += "_";
   }
-  VLOG(6) << "[op name normalizing]: " << op_desc.Type() << " to "
-          << target_op_name;
-  op_info = ctx->GetRegisteredOpInfo(target_op_name);
   if (!op_info) {
     IR_THROW("Op %d should have corresponding OpInfo %d",
              op_desc.Type(),
@@ -792,8 +801,9 @@ pir::Operation* OpTranscriber::operator()(pir::IrContext* ctx,
   OpInputInfoList input_infos;
   OpAttributeInfoList attr_infos;
   OpOutputInfoList output_infos;
+
   std::tie(input_infos, attr_infos, output_infos, std::ignore, std::ignore) =
-      op_info_concept->get_op_info_();
+      op_info_concept->get_op_info_(op_info.name());
 
   this->InsertSliceOperationForInput(
       ctx, param_map, op_desc, input_infos, block);
@@ -810,7 +820,6 @@ pir::Operation* OpTranscriber::operator()(pir::IrContext* ctx,
       this->TranslateOpAttribute(ctx, op_info.name(), attr_infos, op_desc);
   TranslateOpDistAttribute(op_desc, &attribute_map);
   VLOG(4) << "[general op][" << op_desc.Type() << "] preparation end.";
-
   pir::Operation* operation = pir::Operation::Create(
       op_inputs, attribute_map, op_output_types, op_info);
   VLOG(4) << "[general op][" << op_desc.Type() << "] opearation creation end.";
@@ -940,7 +949,7 @@ struct AssignValueOpTranscriber : public OpTranscriber {
     OpAttributeInfoList attr_infos;
     OpOutputInfoList output_infos;
     std::tie(input_infos, attr_infos, output_infos, std::ignore, std::ignore) =
-        op_info_concept->get_op_info_();
+        op_info_concept->get_op_info_(op_info.name());
     std::unordered_map<std::string, OpAttributeInfo> attr_info_maps;
     for (auto const& info : attr_infos) {
       attr_info_maps.insert({info.name, info});
@@ -1274,7 +1283,7 @@ struct FetchOpTranscriber : public OpTranscriber {
     OpAttributeInfoList attr_infos;
     OpOutputInfoList output_infos;
     std::tie(input_infos, attr_infos, output_infos, std::ignore, std::ignore) =
-        op_info_concept->get_op_info_();
+        op_info_concept->get_op_info_(op_info.name());
 
     this->InsertSliceOperationForInput(
         ctx, param_map, op_desc, input_infos, block);
diff --git a/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.cc b/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.cc
index ecf04d4411397..63e2a83a7dbe9 100644
--- a/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.cc
+++ b/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.cc
@@ -26,20 +26,7 @@ REGISTER_FILE_SYMBOLS(kernel_dialect);
 namespace paddle {
 namespace dialect {
 
-KernelDialect::KernelDialect(pir::IrContext *context)
-    : pir::Dialect(name(), context, pir::TypeId::get<KernelDialect>()) {
-  initialize();
-}
-
-void KernelDialect::initialize() {
-  RegisterTypes<AllocatedDenseTensorType,
-                AllocatedSelectedRowsType,
-                AllocatedDenseTensorArrayType>();
-  RegisterOps<PhiKernelOp, LegacyKernelOp>();
-  RegisterAttributes<KernelAttribute>();
-}
-
-void KernelDialect::PrintType(pir::Type type, std::ostream &os) const {
+void PrintKernelType(pir::Type type, std::ostream &os) {
   if (type.isa<AllocatedDenseTensorType>()) {
     AllocatedDenseTensorType tensor_type =
         type.dyn_cast<AllocatedDenseTensorType>();
 
     os << phi::AllocationTypeStr(tensor_type.place().GetType()) << "_";
     os << "tensor<";
     for (auto d : common::vectorize(tensor_type.dims())) {
       os << d;
       os << "x";
     }
     tensor_type.dtype().Print(os);
     os << ">";
   } else if (type.isa<AllocatedSelectedRowsType>()) {
     AllocatedSelectedRowsType tensor_type =
         type.dyn_cast<AllocatedSelectedRowsType>();
 
     os << phi::AllocationTypeStr(tensor_type.place().GetType()) << "_";
     os << "tensor<";
     for (auto d : common::vectorize(tensor_type.dims())) {
       os << d;
       os << "x";
     }
     tensor_type.dtype().Print(os);
     os << ">";
   } else if (type.isa<AllocatedDenseTensorArrayType>()) {
     AllocatedDenseTensorArrayType tensor_array_type =
         type.dyn_cast<AllocatedDenseTensorArrayType>();
 
     os << phi::AllocationTypeStr(tensor_array_type.place().GetType()) << "_";
     os << "tensor_array<";
     tensor_array_type.dtype().Print(os);
     os << ">";
   }
 }
 
-void KernelDialect::PrintAttribute(pir::Attribute attr,
-                                   std::ostream &os) const {
+void PrintKernelAttribute(pir::Attribute attr, std::ostream &os) {
   phi::KernelKey kernel = attr.dyn_cast<KernelAttribute>().data();
 
   os << "<backend:" << kernel.backend() << "|layout:" << kernel.layout()
      << "|dtype:" << kernel.dtype() << ">";
 }
 
+KernelDialect::KernelDialect(pir::IrContext *context)
+    : pir::Dialect(name(), context, pir::TypeId::get<KernelDialect>()) {
+  initialize();
+}
+
+void KernelDialect::initialize() {
+  RegisterTypes<AllocatedDenseTensorType,
+                AllocatedSelectedRowsType,
+                AllocatedDenseTensorArrayType>();
+  RegisterOps<PhiKernelOp, LegacyKernelOp>();
+  RegisterAttributes<KernelAttribute>();
+}
+
+void KernelDialect::PrintType(pir::Type type, std::ostream &os) const {
+  PrintKernelType(type, os);
+}
+
+void KernelDialect::PrintAttribute(pir::Attribute attr,
+                                   std::ostream &os) const {
+  PrintKernelAttribute(attr, os);
+}
+
 void KernelDialect::PrintOperation(pir::Operation *op,
                                    pir::IrPrinter &printer) const {
   if (op->dyn_cast<PhiKernelOp>() || op->dyn_cast<LegacyKernelOp>()) {
@@ -122,6 +130,45 @@ void KernelDialect::PrintOperation(pir::Operation *op,
   }
 }
 
+CustomKernelDialect::CustomKernelDialect(pir::IrContext *context)
+    : pir::Dialect(name(), context, pir::TypeId::get<CustomKernelDialect>()) {
+  initialize();
+}
+
+void CustomKernelDialect::initialize() {
+  RegisterTypes<AllocatedDenseTensorType,
+                AllocatedSelectedRowsType,
+                AllocatedDenseTensorArrayType>();
+  RegisterOps<CustomKernelOp>();
+  RegisterAttributes<KernelAttribute>();
+}
+
+void CustomKernelDialect::PrintType(pir::Type type, std::ostream &os) const {
+  PrintKernelType(type, os);
+}
+
+void CustomKernelDialect::PrintAttribute(pir::Attribute attr,
+                                         std::ostream &os) const {
+  PrintKernelAttribute(attr, os);
+}
+
+void CustomKernelDialect::PrintOperation(pir::Operation *op,
+                                         pir::IrPrinter &printer) const {
+  auto &os = printer.os;
+  printer.PrintOpResult(op);
+  os << " =";
+  auto custom_kernel_op = op->dyn_cast<CustomKernelOp>();
+  std::string kernel_name = custom_kernel_op.kernel_name();
+  if (op->attributes().count("is_inplace") != 0 &&
+      op->attributes()
+          .at("is_inplace")
+          .dyn_cast<pir::BoolAttribute>()
+          .data()) {
+    kernel_name = kernel_name + "_";
+  }
+  os << " \"" << kernel_name << "(custom_kernel)\"";
+  printer.PrintOpOperands(op);
+  printer.PrintAttributeMap(op);
+  os << " :";
+  printer.PrintOperandsType(op);
+  os << " -> ";
+  printer.PrintOpReturnType(op);
+}
 
 #ifdef PADDLE_WITH_DNNL
 OneDNNKernelDialect::OneDNNKernelDialect(pir::IrContext *context)
     : pir::Dialect(name(), context, pir::TypeId::get<OneDNNKernelDialect>()) {
@@ -139,47 +186,12 @@ void OneDNNKernelDialect::initialize() {
 }
 
 void OneDNNKernelDialect::PrintType(pir::Type type, std::ostream &os) const {
-  if (type.isa<AllocatedDenseTensorType>()) {
-    AllocatedDenseTensorType tensor_type =
-        type.dyn_cast<AllocatedDenseTensorType>();
-
-    os << phi::AllocationTypeStr(tensor_type.place().GetType()) << "_";
-    os << "tensor<";
-    for (auto d : common::vectorize(tensor_type.dims())) {
-      os << d;
-      os << "x";
-    }
-    tensor_type.dtype().Print(os);
-    os << ">";
-  } else if (type.isa<AllocatedSelectedRowsType>()) {
-    AllocatedSelectedRowsType tensor_type =
-        type.dyn_cast<AllocatedSelectedRowsType>();
-
-    os << phi::AllocationTypeStr(tensor_type.place().GetType()) << "_";
-    os << "tensor<";
-    for (auto d : common::vectorize(tensor_type.dims())) {
-      os << d;
-      os << "x";
-    }
-    tensor_type.dtype().Print(os);
-    os << ">";
-  } else if (type.isa<AllocatedDenseTensorArrayType>()) {
-    AllocatedDenseTensorArrayType tensor_array_type =
-        type.dyn_cast<AllocatedDenseTensorArrayType>();
-
-    os << phi::AllocationTypeStr(tensor_array_type.place().GetType()) << "_";
-    os << "tensor_array<";
-    tensor_array_type.dtype().Print(os);
-    os << ">";
-  }
+  PrintKernelType(type, os);
 }
 
 void OneDNNKernelDialect::PrintAttribute(pir::Attribute attr,
                                          std::ostream &os) const {
-  phi::KernelKey kernel = attr.dyn_cast<KernelAttribute>().data();
-
-  os << "<backend:" << kernel.backend() << "|layout:" << kernel.layout()
-     << "|dtype:" << kernel.dtype() << ">";
+  PrintKernelAttribute(attr, os);
 }
 
 void OneDNNKernelDialect::PrintOperation(pir::Operation *op,
@@ -226,6 +238,7 @@ void OneDNNKernelDialect::PrintOperation(pir::Operation *op,
 }  // namespace paddle
 
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::KernelDialect)
+IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::CustomKernelDialect)
 #ifdef PADDLE_WITH_DNNL
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::OneDNNKernelDialect)
 #endif
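Illustrative only — with the printers above, a kernel_key attribute renders roughly as <backend:GPU|layout:Undefined(AnyLayout)|dtype:float32>, and CustomKernelDialect::PrintOperation would emit a custom op along the lines of %0 = "custom_relu(custom_kernel)" (%arg0) {...} : (gpu_tensor<1x1x28x28xf32>) -> gpu_tensor<1x1x28x28xf32> (shapes and key values hypothetical).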
diff --git a/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h b/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h
index fbdb53a40b183..ad198cb25296d 100644
--- a/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h
+++ b/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h
@@ -36,6 +36,23 @@ class KernelDialect : public pir::Dialect {
   void initialize();
 };
 
+class CustomKernelDialect : public pir::Dialect {
+ public:
+  explicit CustomKernelDialect(pir::IrContext* context);
+
+  static const char* name() { return "custom_kernel"; }
+
+  void PrintType(pir::Type type, std::ostream& os) const override;
+
+  void PrintAttribute(pir::Attribute attr, std::ostream& os) const override;
+
+  void PrintOperation(pir::Operation* op,
+                      pir::IrPrinter& printer) const override;  // NOLINT
+
+ private:
+  void initialize();
+};
+
 #ifdef PADDLE_WITH_DNNL
 class OneDNNKernelDialect : public pir::Dialect {
  public:
@@ -59,6 +76,7 @@ class OneDNNKernelDialect : public pir::Dialect {
 }  // namespace paddle
 
 IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::KernelDialect)
+IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::CustomKernelDialect)
 #ifdef PADDLE_WITH_DNNL
 IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::OneDNNKernelDialect)
 #endif
diff --git a/paddle/fluid/pir/dialect/kernel/ir/kernel_op.cc b/paddle/fluid/pir/dialect/kernel/ir/kernel_op.cc
index 45f0a848fc174..c5095046ff8ae 100644
--- a/paddle/fluid/pir/dialect/kernel/ir/kernel_op.cc
+++ b/paddle/fluid/pir/dialect/kernel/ir/kernel_op.cc
@@ -98,6 +98,46 @@ phi::KernelKey LegacyKernelOp::kernel_key() {
   return attributes().at("kernel_key").dyn_cast<KernelAttribute>().data();
 }
 
+const char* CustomKernelOp::attributes_name[attributes_num] = {  // NOLINT
+    "op_name",
+    "kernel_name",
+    "kernel_key"};
+
+void CustomKernelOp::VerifySig() {
+  VLOG(4) << "Verifying inputs, outputs and attributes for: CustomKernelOp.";
+  auto& attributes = this->attributes();
+
+  PADDLE_ENFORCE(attributes.count("op_name") > 0 &&
+                     attributes.at("op_name").isa<pir::StrAttribute>(),
+                 phi::errors::PreconditionNotMet(
+                     "Type of attribute: op_name is not right."));
+
+  PADDLE_ENFORCE(attributes.count("kernel_name") > 0 &&
+                     attributes.at("kernel_name").isa<pir::StrAttribute>(),
+                 phi::errors::PreconditionNotMet(
+                     "Type of attribute: kernel_name is not right."));
+
+  PADDLE_ENFORCE(attributes.count("kernel_key") > 0 &&
+                     attributes.at("kernel_key").isa<KernelAttribute>(),
+                 phi::errors::PreconditionNotMet(
+                     "Type of attribute: kernel_key is not right."));
+}
+
+std::string CustomKernelOp::op_name() {
+  return attributes().at("op_name").dyn_cast<pir::StrAttribute>().AsString();
+}
+
+std::string CustomKernelOp::kernel_name() {
+  return attributes()
+      .at("kernel_name")
+      .dyn_cast<pir::StrAttribute>()
+      .AsString();
+}
+
+phi::KernelKey CustomKernelOp::kernel_key() {
+  return attributes().at("kernel_key").dyn_cast<KernelAttribute>().data();
+}
+
 #ifdef PADDLE_WITH_DNNL
 const char* OneDNNPhiKernelOp::attributes_name[attributes_num] = {  // NOLINT
     "op_name",
@@ -134,6 +174,7 @@ std::string OneDNNPhiKernelOp::kernel_name() {
       .dyn_cast<pir::StrAttribute>()
       .AsString();
 }
+
 phi::KernelKey OneDNNPhiKernelOp::kernel_key() {
   return attributes().at("kernel_key").dyn_cast<KernelAttribute>().data();
 }
@@ -225,6 +266,7 @@ phi::KernelKey OneDNNLegacyKernelOp::kernel_key() {
 
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::PhiKernelOp)
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::LegacyKernelOp)
+IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::CustomKernelOp)
 #ifdef PADDLE_WITH_DNNL
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::OneDNNPhiKernelOp)
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::OneDNNMixedPhiKernelOp)
diff --git a/paddle/fluid/pir/dialect/kernel/ir/kernel_op.h b/paddle/fluid/pir/dialect/kernel/ir/kernel_op.h
index df72315870208..0fcaeb2080742 100644
--- a/paddle/fluid/pir/dialect/kernel/ir/kernel_op.h
+++ b/paddle/fluid/pir/dialect/kernel/ir/kernel_op.h
@@ -44,6 +44,18 @@ class LegacyKernelOp : public pir::Op<LegacyKernelOp> {
   void VerifySig();
 };
 
+class CustomKernelOp : public pir::Op<CustomKernelOp> {
+ public:
+  using Op::Op;
+  static const char *name() { return "custom_kernel"; }
+  static constexpr uint32_t attributes_num = 3;
+  static const char *attributes_name[attributes_num];
+  std::string op_name();
+  std::string kernel_name();
+  phi::KernelKey kernel_key();
+  void VerifySig();
+};
+
 #ifdef PADDLE_WITH_DNNL
 class OneDNNPhiKernelOp : public pir::Op<OneDNNPhiKernelOp> {
  public:
@@ -87,6 +99,7 @@ class OneDNNLegacyKernelOp : public pir::Op<OneDNNLegacyKernelOp> {
 
 IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::PhiKernelOp)
 IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::LegacyKernelOp)
+IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::CustomKernelOp)
 #ifdef PADDLE_WITH_DNNL
 IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::OneDNNPhiKernelOp)
 IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::OneDNNMixedPhiKernelOp)
diff --git a/paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h b/paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h
index 24a7622fa99b0..0f045cb97a0ec 100644
--- a/paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h
+++ b/paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h
@@ -28,26 +28,28 @@ namespace dialect {
 class OpYamlInfoInterface : public pir::OpInterfaceBase<OpYamlInfoInterface> {
  public:
   struct Concept {
-    explicit Concept(OpInfoTuple (*get_op_info)())
+    explicit Concept(OpInfoTuple (*get_op_info)(const std::string& op_name))
         : get_op_info_(get_op_info) {}
-    OpInfoTuple (*get_op_info_)();
+    OpInfoTuple (*get_op_info_)(const std::string& op_name);
   };
 
   template <typename ConcreteOp>
   struct Model : public Concept {
-    static OpInfoTuple GetOpInfo() { return ConcreteOp::GetOpInfo(); }
+    static OpInfoTuple GetOpInfo(const std::string& op_name) {
+      return ConcreteOp::GetOpInfo();
+    }
     Model() : Concept(GetOpInfo) {}
   };
 
   /// Constructor
-  OpYamlInfoInterface(pir::Operation *op, Concept *impl)
+  OpYamlInfoInterface(pir::Operation* op, Concept* impl)
       : pir::OpInterfaceBase<OpYamlInfoInterface>(op), impl_(impl) {}
 
-  OpInfoTuple GetOpInfo() { return impl_->get_op_info_(); }
+  OpInfoTuple GetOpInfo() { return impl_->get_op_info_(operation_->name()); }
 
  private:
-  Concept *impl_;
+  Concept* impl_;
 };
 
 }  // namespace dialect
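Call-site consequence of the widened Concept — a hedged sketch using only the pieces declared above (the op and impl handles are hypothetical):

auto* impl = op_info.GetInterfaceImpl<paddle::dialect::OpYamlInfoInterface>();
paddle::dialect::OpYamlInfoInterface iface(op, impl);
// Generated ops ignore the forwarded name (Model discards op_name);
// name-keyed dialects such as custom_kernel use it to pick their OpInfoTuple.
auto op_info_tuple = iface.GetOpInfo();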
#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/framework/custom_operator_utils.h" #include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h" #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" @@ -29,10 +30,20 @@ namespace paddle { namespace dialect { +static std::unordered_map kCustomTypeMap = { + {"bool", "pir::BoolAttribute"}, + {"int", "pir::Int32Attribute"}, + {"float", "pir::FloatAttribute"}, + {"int64_t", "pir::Int64Attribute"}, + {"std::string", "pir::StrAttribute"}, + {"std::vector", "pir::ArrayAttribute"}, + {"std::vector", "pir::ArrayAttribute"}, + {"std::vector", "pir::ArrayAttribute"}, + {"std::vector", "pir::ArrayAttribute"}}; struct CombineOpInferSymbolicShapeInterfaceModel : public InferSymbolicShapeInterface::Concept { static inline bool InferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + pir::Operation* op, pir::ShapeConstraintIRAnalysis* shape_analysis) { symbol::ShapeOrDataDimExprs value_shape; // for (auto operand_source : op->operands_source()) { @@ -55,7 +66,7 @@ struct CombineOpInferSymbolicShapeInterfaceModel : InferSymbolicShapeInterface::Concept(InferSymbolicShape) {} }; -OperatorDialect::OperatorDialect(pir::IrContext *ctx) +OperatorDialect::OperatorDialect(pir::IrContext* ctx) : pir::Dialect(name(), ctx, pir::TypeId::get()) { initialize(); ctx->GetOrRegisterDialect<::pir::ControlFlowDialect>(); @@ -69,40 +80,7 @@ OperatorDialect::OperatorDialect(pir::IrContext *ctx) CombineOpInferSymbolicShapeInterfaceModel>())); } -void OperatorDialect::initialize() { - RegisterTypes(); - - RegisterAttributes(); - - // NOTE(zhangbo9674): GET_OP_LIST is defined in pd_op.h which is - // generated by op_gen.py, see details in - // paddle/fluid/pir/dialect/CMakeLists.txt. - // NOTE(Ruting)GET_MANUAL_OP_LIST is define in manual_op.h" - // use RegisterOps when list has more than two ops. 
-void OperatorDialect::initialize() {
-  RegisterTypes();
-
-  RegisterAttributes();
-
-  // NOTE(zhangbo9674): GET_OP_LIST is defined in pd_op.h which is
-  // generated by op_gen.py, see details in
-  // paddle/fluid/pir/dialect/CMakeLists.txt.
-  // NOTE(Ruting)GET_MANUAL_OP_LIST is define in manual_op.h"
-  // use RegisterOps when list has more than two ops.
-  RegisterOps<
-#define GET_OP_LIST
-#include "paddle/fluid/pir/dialect/operator/ir/pd_op_info.cc"  // NOLINT
-      >();
-
-  RegisterOps<
-#define GET_OP_LIST
-#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc"  // NOLINT
-      >();
-
-  RegisterOps<
-#define GET_OP_LIST
-#include "paddle/fluid/pir/dialect/operator/ir/manual_op.cc"  // NOLINT
-      >();
-
-  RegisterInterfaces();
-}
-
-void OperatorDialect::PrintType(pir::Type type, std::ostream &os) const {
+void PrintTypeImpl(pir::Type type, std::ostream& os) {
   os << type.dialect().name();
   os << '.';
   if (auto tensor_type = type.dyn_cast<DenseTensorType>()) {
@@ -127,16 +105,14 @@ void OperatorDialect::PrintType(pir::Type type, std::ostream &os) const {
     os << ">";
   }
 }
-
-void OperatorDialect::PrintAttribute(pir::Attribute attr,
-                                     std::ostream &os) const {
+void PrintAttributeImpl(pir::Attribute attr, std::ostream& os) {
   os << "(" << attr.dialect().name();
   os << '.';
   if (auto int_array_attr = attr.dyn_cast<IntArrayAttribute>()) {
     phi::IntArray data = int_array_attr.data();
     os << "IntArray)"
        << "[";
-    const auto &inner_data = data.GetData();
+    const auto& inner_data = data.GetData();
     pir::PrintInterleave(
         inner_data.begin(),
         inner_data.end(),
@@ -154,7 +130,60 @@ void OperatorDialect::PrintAttribute(pir::Attribute attr,
   }
 }
 
-pir::Type OperatorDialect::ParseType(pir::IrParser &parser) {  // NOLINT
+void PrintOperationImpl(pir::Operation* op,
+                        pir::IrPrinter& printer) {  // NOLINT
+  if (auto if_op = op->dyn_cast<IfOp>()) {
+    if_op.Print(printer);
+  } else if (auto while_op = op->dyn_cast<WhileOp>()) {
+    while_op.Print(printer);
+  } else {
+    printer.PrintGeneralOperation(op);
+  }
+}
+
+void OperatorDialect::initialize() {
+  RegisterTypes();
+
+  RegisterAttributes();
+
+  // NOTE(zhangbo9674): GET_OP_LIST is defined in pd_op.h which is
+  // generated by op_gen.py, see details in
+  // paddle/fluid/pir/dialect/CMakeLists.txt.
+  // NOTE(Ruting)GET_MANUAL_OP_LIST is define in manual_op.h"
+  // use RegisterOps when list has more than two ops.
+  RegisterOps<
+#define GET_OP_LIST
+#include "paddle/fluid/pir/dialect/operator/ir/pd_op_info.cc"  // NOLINT
+      >();
+
+  RegisterOps<
+#define GET_OP_LIST
+#include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc"  // NOLINT
+      >();
+
+  RegisterOps<
+#define GET_OP_LIST
+#include "paddle/fluid/pir/dialect/operator/ir/manual_op.cc"  // NOLINT
+      >();
+
+  RegisterInterfaces();
+}
+
+void OperatorDialect::PrintType(pir::Type type, std::ostream& os) const {
+  PrintTypeImpl(type, os);
+}
+
+void OperatorDialect::PrintAttribute(pir::Attribute attr,
+                                     std::ostream& os) const {
+  PrintAttributeImpl(attr, os);
+}
+
+pir::Type OperatorDialect::ParseType(pir::IrParser& parser) {  // NOLINT
   parser.ConsumeAToken("pd_op.tensor");
   parser.ConsumeAToken("<");
   std::vector<int64_t> dim{};
@@ -184,7 +213,7 @@ pir::Type OperatorDialect::ParseType(pir::IrParser &parser) {  // NOLINT
 }
 
 pir::Attribute OperatorDialect::ParseAttribute(
-    pir::IrParser &parser) {  // NOLINT
+    pir::IrParser& parser) {  // NOLINT
   std::string type_name = parser.ConsumeToken().val_;
   std::string attribute_name =
       type_name.substr(type_name.find('.') + 1, std::string::npos);
@@ -203,18 +232,195 @@ pir::Attribute OperatorDialect::ParseAttribute(
   }
 }
 
-void OperatorDialect::PrintOperation(pir::Operation *op,
-                                     pir::IrPrinter &printer) const {
-  if (auto if_op = op->dyn_cast<IfOp>()) {
-    if_op.Print(printer);
-  } else if (auto while_op = op->dyn_cast<WhileOp>()) {
-    while_op.Print(printer);
-  } else {
-    printer.PrintGeneralOperation(op);
-  }
+void OperatorDialect::PrintOperation(pir::Operation* op,
+                                     pir::IrPrinter& printer) const {
+  PrintOperationImpl(op, printer);
+}
+
+class IdManager {
+ public:
+  static IdManager& Instance() {
+    static IdManager instance;
+    return instance;
+  }
+
+  ~IdManager() {
+    for (auto id : ids_) {
+      delete id;
+    }
+    ids_.clear();
+  }
+
+  pir::TypeId CreateId() {
+    pir::detail::UniqueingId* unique_id = new pir::detail::UniqueingId();
+    ids_.push_back(unique_id);
+    return ids_.back()->id();
+  }
+
+ private:
+  std::vector<pir::detail::UniqueingId*> ids_;
+};
+
+class AttributeManager {
+ public:
+  static AttributeManager& Instance() {
+    static AttributeManager instance;
+    return instance;
+  }
+
+  ~AttributeManager() {
+    for (size_t i = 0; i < char_pointers_.size(); i++) {
+      for (size_t j = 0; j < pointers_size_[i]; j++) {
+        delete[] char_pointers_[i][j];
+      }
+      delete[] char_pointers_[i];
+    }
+    char_pointers_.clear();
+    pointers_size_.clear();
+  }
+
+  const char** ToCharPointers(const std::vector<std::string>& attr_names) {
+    const char** char_pointers = new const char*[attr_names.size()];
+    for (size_t i = 0; i < attr_names.size(); i++) {
+      const std::string& attr_name = attr_names[i];
+      char* ptr = new char[attr_name.size() + 1];
+      snprintf(ptr, attr_name.size() + 1, "%s", attr_name.c_str());
+      char_pointers[i] = ptr;
+    }
+    pointers_size_.push_back(attr_names.size());
+    char_pointers_.push_back(char_pointers);
+    return char_pointers;
+  }
+
+ private:
+  std::vector<const char**> char_pointers_;
+  std::vector<size_t> pointers_size_;
+};
+
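`AttributeManager` exists because op registration needs a raw `const char**` of attribute names that must stay alive as long as the registered op info, so the singleton copies every name and releases the copies at shutdown. Since each buffer comes from array `new`, it must be freed with `delete[]`, never scalar `delete`. A self-contained sketch of the same ownership scheme follows; `NameArena` is a made-up name, not a Paddle class.

#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

class NameArena {
 public:
  // Copies each string into a NUL-terminated buffer and hands back a
  // const char** view; the arena keeps ownership, as ToCharPointers does.
  const char** Intern(const std::vector<std::string>& names) {
    const char** table = new const char*[names.size()];
    for (size_t i = 0; i < names.size(); ++i) {
      char* buf = new char[names[i].size() + 1];
      snprintf(buf, names[i].size() + 1, "%s", names[i].c_str());
      table[i] = buf;
    }
    tables_.push_back(table);
    sizes_.push_back(names.size());
    return table;
  }

  ~NameArena() {
    for (size_t i = 0; i < tables_.size(); ++i) {
      for (size_t j = 0; j < sizes_[i]; ++j) {
        delete[] tables_[i][j];  // allocated with new char[], so delete[]
      }
      delete[] tables_[i];  // allocated with new const char*[], so delete[]
    }
  }

 private:
  std::vector<const char**> tables_;
  std::vector<size_t> sizes_;
};

int main() {
  NameArena arena;
  const char** attrs = arena.Intern({"scale", "bias"});
  std::printf("%s %s\n", attrs[0], attrs[1]);  // valid until ~NameArena
}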
"paddle::dialect::DenseTensorType"; + if (paddle::framework::detail::IsOptionalVar(input_name)) { + is_optional = true; + } + if (paddle::framework::detail::IsDuplicableVar(input_name)) { + input_type = "pir::VectorType"; + } + // Now, we only support dense tensor as input. + inputs_info.push_back(paddle::dialect::OpInputInfo{ + input_name, input_type, is_optional, false, false, false}); + } + + // translate attr info + auto& op_attrs = OpMetaInfoHelper::GetAttrs(op_meta); + for (const auto& op_attr : op_attrs) { + auto attr_name_and_type = paddle::ParseAttrStr(op_attr); + auto attr_name = attr_name_and_type[0]; + auto attr_type_str = attr_name_and_type[1]; + param_names.push_back(attr_name); + if (kCustomTypeMap.find(attr_type_str) == kCustomTypeMap.end()) { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported `%s` type value as custom attribute now. " + "Supported data types include `bool`, `int`, `float`, " + "`int64_t`, `std::string`, `std::vector`, " + "`std::vector`, `std::vector`, " + "`std::vector`, Please check whether " + "the attribute data type and data type string are matched.", + attr_type_str)); + } + std::string attr_pir_type = kCustomTypeMap[attr_type_str]; + attributes_info.push_back( + paddle::dialect::OpAttributeInfo{attr_name, attr_pir_type, ""}); + } + + // translate output info + auto& op_output_names = OpMetaInfoHelper::GetOutputs(op_meta); + for (const auto& output_name : op_output_names) { + bool is_optional = false; + if (paddle::framework::detail::IsOptionalVar(output_name)) { + is_optional = true; + } + // Now, we only support dense tensor as output. + outputs_info.push_back(paddle::dialect::OpOutputInfo{ + output_name, "paddle::dialect::DenseTensorType", is_optional, false}); + } + + // we only need kernel params name in run_time_info + paddle::dialect::OpRunTimeInfo run_time_info = + paddle::dialect::OpRunTimeInfo("", {}, "", param_names, {}, {}, {}, {}); + return std::make_tuple( + inputs_info, attributes_info, outputs_info, run_time_info, ""); + } + + CustomOpInfoInterfaceModel() : OpYamlInfoInterface::Concept(GetPirOpInfo) {} +}; + +CustomOpDialect::CustomOpDialect(pir::IrContext* context) + : pir::Dialect(name(), context, pir::TypeId::get()) {} + +void CustomOpDialect::PrintType(pir::Type type, std::ostream& os) const { + PrintTypeImpl(type, os); } +void CustomOpDialect::PrintAttribute(pir::Attribute attr, + std::ostream& os) const { + PrintAttributeImpl(attr, os); +} + +void CustomOpDialect::PrintOperation(pir::Operation* op, + pir::IrPrinter& printer) const { + PrintOperationImpl(op, printer); +} + +void CustomOpDialect::RegisterCustomOp(const paddle::OpMetaInfo& op_meta) { + pir::TypeId id = IdManager::Instance().CreateId(); + std::string op_name = paddle::framework::kCustomDialectPrefix + + OpMetaInfoHelper::GetOpName(op_meta); + op_names_.push_back(op_name); + + auto& op_attrs = OpMetaInfoHelper::GetAttrs(op_meta); + std::vector attr_names; + for (const auto& op_attr : op_attrs) { + auto attr_name_and_type = paddle::ParseAttrStr(op_attr); + auto attr_name = attr_name_and_type[0]; + attr_names.push_back(attr_name); + } + const char** attr_name = + AttributeManager::Instance().ToCharPointers(attr_names); + uint32_t attr_num = attr_names.size(); + + std::vector traits; + std::set interface_values; + pir::InterfaceValue op_info_interface = + pir::InterfaceValue::Get(); + interface_values.insert(std::move(op_info_interface)); + // Currently we set empty verify function and will reset it if it is used in + // future. 
+void CustomOpDialect::RegisterCustomOp(const paddle::OpMetaInfo& op_meta) {
+  pir::TypeId id = IdManager::Instance().CreateId();
+  std::string op_name = paddle::framework::kCustomDialectPrefix +
+                        OpMetaInfoHelper::GetOpName(op_meta);
+  op_names_.push_back(op_name);
+
+  auto& op_attrs = OpMetaInfoHelper::GetAttrs(op_meta);
+  std::vector<std::string> attr_names;
+  for (const auto& op_attr : op_attrs) {
+    auto attr_name_and_type = paddle::ParseAttrStr(op_attr);
+    auto attr_name = attr_name_and_type[0];
+    attr_names.push_back(attr_name);
+  }
+  const char** attr_name =
+      AttributeManager::Instance().ToCharPointers(attr_names);
+  uint32_t attr_num = attr_names.size();
+
+  std::vector<pir::TypeId> traits;
+  std::set<pir::InterfaceValue> interface_values;
+  pir::InterfaceValue op_info_interface =
+      pir::InterfaceValue::Get<OpYamlInfoInterface,
+                               CustomOpInfoInterfaceModel>();
+  interface_values.insert(std::move(op_info_interface));
+  // Currently we set empty verify function and will reset it if it is used in
+  // future.
+  pir::VerifyPtr verify_func = [](pir::Operation* op) {};
+  ir_context()->RegisterOpInfo(this,
+                               id,
+                               op_names_.back().c_str(),
+                               std::move(interface_values),
+                               traits,
+                               attr_num,
+                               attr_name,
+                               verify_func,
+                               verify_func);
 }
 }  // namespace dialect
 }  // namespace paddle
 
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::OperatorDialect)
+IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::CustomOpDialect)
diff --git a/paddle/fluid/pir/dialect/operator/ir/op_dialect.h b/paddle/fluid/pir/dialect/operator/ir/op_dialect.h
index 8a61f6cb9615b..d6626f999ffd1 100644
--- a/paddle/fluid/pir/dialect/operator/ir/op_dialect.h
+++ b/paddle/fluid/pir/dialect/operator/ir/op_dialect.h
@@ -14,7 +14,10 @@
 
 #pragma once
 
+#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h"
+#include "paddle/phi/api/ext/op_meta_info.h"
 #include "paddle/pir/core/dialect.h"
+#include "paddle/pir/core/operation.h"
 #include "paddle/utils/test_macros.h"
 
 namespace paddle {
@@ -39,7 +42,39 @@ class TEST_API OperatorDialect : public pir::Dialect {
   void initialize();
 };
 
+inline bool IsCustomOp(pir::Operation* op) {
+  std::string op_name = op->name();
+  return op_name.find("custom_op") != op_name.npos;
+}
+
+class CustomOpDialect : public pir::Dialect {
+ public:
+  explicit CustomOpDialect(pir::IrContext* context);
+
+  static const char* name() { return "custom_op"; }
+
+  void PrintType(pir::Type type, std::ostream& os) const override;
+
+  void PrintAttribute(pir::Attribute attr, std::ostream& os) const override;
+
+  void PrintOperation(pir::Operation* op,
+                      pir::IrPrinter& printer) const override;  // NOLINT
+
+  void RegisterCustomOp(const paddle::OpMetaInfo& op_meta);
+
+  bool HasRegistered(const std::string& op_name) {
+    if (std::find(op_names_.begin(), op_names_.end(), op_name) !=
+        op_names_.end()) {
+      return true;
+    }
+    return false;
+  }
+
+ private:
+  std::vector<std::string> op_names_;
+};
+
 }  // namespace dialect
 }  // namespace paddle
 
 IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::OperatorDialect)
+IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::CustomOpDialect)
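A custom op's identity is purely name-based: registration stores the fully qualified "custom_op."-prefixed name, and later passes recognize such ops with a substring test like `IsCustomOp` above. A standalone sketch of that naming contract (helper and type names are made up):

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

const char kPrefix[] = "custom_op.";

// Mirrors RegisterCustomOp's bookkeeping: remember every qualified name.
struct Registry {
  std::vector<std::string> names;
  void Register(const std::string& base) { names.push_back(kPrefix + base); }
  bool HasRegistered(const std::string& full) const {
    return std::find(names.begin(), names.end(), full) != names.end();
  }
};

// Mirrors IsCustomOp: dialect membership is a substring test on the name.
bool IsCustomOpName(const std::string& op_name) {
  return op_name.find("custom_op") != std::string::npos;
}

int main() {
  Registry reg;
  reg.Register("my_relu");
  assert(reg.HasRegistered("custom_op.my_relu"));
  assert(IsCustomOpName("custom_op.my_relu"));
  assert(!IsCustomOpName("pd_op.add"));
}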
diff --git a/paddle/fluid/pir/transforms/inplace_pass.cc b/paddle/fluid/pir/transforms/inplace_pass.cc
index b836617321f8c..56d767180c15a 100644
--- a/paddle/fluid/pir/transforms/inplace_pass.cc
+++ b/paddle/fluid/pir/transforms/inplace_pass.cc
@@ -181,7 +181,8 @@ bool IsNoNeedBuffer(pir::Operation* op, pir::Value value) {
       op_info.GetInterfaceImpl<paddle::dialect::OpYamlInfoInterface>();
   if (info_interface) {
     paddle::dialect::OpYamlInfoParser info_parser(
-        info_interface->get_op_info_(), paddle::dialect::IsLegacyOp(op_name));
+        info_interface->get_op_info_(op_name),
+        paddle::dialect::IsLegacyOp(op_name));
     auto& no_need_buffer_ids = info_parser.NoNeedBufferIds();
     for (size_t id = 0; id < no_need_buffer_ids.size(); id++) {
       if (value == op->operand_source(no_need_buffer_ids[id])) {
@@ -274,23 +275,19 @@ void GetEagerDelValueOfOp(
 std::unordered_map<pir::Operation*, std::unordered_set<pir::Value>>
 GetEagerDeletionValues(const pir::Block& block) {
   std::unordered_set<pir::Value> skip_dels = GetSkipDeletionValues(block);
-
   std::unordered_map<pir::Value, pir::Operation*> del_value_2_op;
   GetEagerDelValueOfOp(block, skip_dels, &del_value_2_op);
-
   std::unordered_map<pir::Operation*, std::unordered_set<pir::Value>>
       eager_dels;
   for (auto& kv : del_value_2_op) {
     eager_dels[kv.second].insert(kv.first);
   }
-
   return eager_dels;
 }
 
 std::unordered_map<pir::Operation*, std::string> GetInplaceOps(
     const pir::Block& block) {
   const auto eager_dels = GetEagerDeletionValues(block);
-
   std::unordered_map<pir::Operation*, std::string> inplace_ops;
 
   std::unordered_set<pir::Value> visited_values;
@@ -312,7 +309,6 @@ std::unordered_map<pir::Operation*, std::string> GetInplaceOps(
       }
       continue;
     }
-
     auto upper_op_attrs = op.attributes();
     auto upper_op_name =
         upper_op_attrs.at("op_name").dyn_cast<pir::StrAttribute>().AsString();
@@ -389,7 +385,7 @@ std::unordered_map<pir::Operation*, std::string> GetInplaceOps(
         phi::errors::PreconditionNotMet(
             "can not find OpYamlInfoInterface from [%s]", upper_op_name + "_"));
     paddle::dialect::OpYamlInfoParser upper_inplace_op_info_parser(
-        upper_inplace_op_interface->get_op_info_());
+        upper_inplace_op_interface->get_op_info_(upper_op_name + "_"));
     std::unordered_map<uint32_t, uint32_t> inplace_out_2_in =
         upper_inplace_op_info_parser.GetInplaceIdMap();
diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc
index df7b8673d9ea8..165a1d3fde4fc 100644
--- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc
+++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc
@@ -1475,6 +1475,178 @@ void HandleForSpecialOp(
   VLOG(6) << "Deep copy a new special op: " << op_item->name();
 }
 
+void PushBackOutputTypes(pir::IrContext* ctx,
+                         pir::Operation* op_item,
+                         const phi::Place& out_place,
+                         const phi::KernelKey& kernel_key,
+                         std::vector<pir::Type>* op_output_types,
+                         size_t index) {
+  auto result_type = op_item->result(index).type();
+  if (!result_type) {
+    op_output_types->push_back(result_type);
+  } else if (result_type.isa<DenseTensorType>() ||
+             result_type.isa<SelectedRowsType>() ||
+             result_type.isa<DenseTensorArrayType>()) {
+#ifdef PADDLE_WITH_DNNL
+    if (kernel_key.backend() == phi::Backend::ONEDNN) {
+      op_output_types->push_back(BuildOutputType(
+          result_type, out_place, phi::DataLayout::ONEDNN, ctx));
+    } else {
+      op_output_types->push_back(BuildOutputType(result_type, out_place, ctx));
+    }
+#else
+    op_output_types->push_back(BuildOutputType(result_type, out_place, ctx));
+#endif
+
+  } else if (result_type.isa<pir::VectorType>()) {
+    std::vector<pir::Type> vec_inner_types;
+    auto base_types = result_type.dyn_cast<pir::VectorType>().data();
+    for (auto& base_type : base_types) {
+      if (base_type) {
+        if (base_type.isa<DenseTensorType>() ||
+            base_type.isa<SelectedRowsType>()) {
+#ifdef PADDLE_WITH_DNNL
+          if (kernel_key.backend() == phi::Backend::ONEDNN) {
+            vec_inner_types.push_back(BuildOutputType(
+                base_type, out_place, phi::DataLayout::ONEDNN, ctx));
+          } else {
+            vec_inner_types.push_back(
+                BuildOutputType(base_type, out_place, ctx));
+          }
+#else
+          vec_inner_types.push_back(BuildOutputType(base_type, out_place, ctx));
+#endif
+        } else {
+          PADDLE_THROW(phi::errors::Unimplemented(
+              "only support dense tensor and selected rows in vector type "
+              "for now"));
+        }
+      } else {
+        // NOTE(phlrain), kernel not support a nullptr in output
+        pir::Type fp32_dtype = pir::Float32Type::get(ctx);
+        phi::DDim dims = {};
+        phi::DataLayout data_layout = phi::DataLayout::NCHW;
+#ifdef PADDLE_WITH_DNNL
+        if (kernel_key.backend() == phi::Backend::ONEDNN) {
+          data_layout = phi::DataLayout::ONEDNN;
+        }
+#endif
+        phi::LoD lod = {{}};
+        size_t offset = 0;
+        auto dense_tensor_dtype = DenseTensorType::get(
+            ctx, fp32_dtype, dims, data_layout, lod, offset);
+        auto allocated_dense_tensor_dtype =
+            AllocatedDenseTensorType::get(ctx, out_place, dense_tensor_dtype);
+        vec_inner_types.push_back(allocated_dense_tensor_dtype);
+      }
+    }
+
+    pir::Type t1 = pir::VectorType::get(ctx, vec_inner_types);
+    op_output_types->push_back(t1);
+  } else {
+    PADDLE_THROW(phi::errors::Unimplemented(
+        "Result type only support DenseTensorType, SelectedRowType and "
+        "VectorType"));
+  }
+}
+
+void HandleForCustomOp(
+    pir::IrContext* ctx,
+    pir::Operation* op_item,
+    const phi::KernelKey& kernel_key,
+    const phi::Place place,
+    const OpYamlInfoParser* op_info_parser,
+    std::unordered_map<pir::Operation*, pir::Operation*>* map_op_pair,
+    std::unordered_map<pir::Value, pir::Value>* map_value_pair,
+    pir::Block* block) {
+  // Prepare output types
+  std::vector<pir::Type> op_output_types;
+
+  for (size_t i = 0; i < op_item->num_results(); ++i) {
+    phi::Place out_place = phi::TransToPhiPlace(kernel_key.backend());
+    PushBackOutputTypes(
+        ctx, op_item, out_place, kernel_key, &op_output_types, i);
+  }
+
+  // Prepare input
+  std::vector<pir::Value> vec_inputs;
+
+  for (size_t i = 0; i < op_item->num_operands(); ++i) {
+    auto cur_in = op_item->operand_source(i);
+    if (!cur_in) {
+      vec_inputs.emplace_back();
+      continue;
+    }
+    PADDLE_ENFORCE_EQ(
+        map_value_pair->count(cur_in),
+        true,
+        phi::errors::PreconditionNotMet(
+            "[%d]'s input of [%s] op MUST in map pair", i, op_item->name()));
+
+    auto new_in = map_value_pair->at(cur_in);
+    auto new_in_type = new_in.type();
+
+    if (new_in_type.isa<AllocatedDenseTensorType>()) {
+      auto in_place = new_in_type.dyn_cast<AllocatedDenseTensorType>().place();
+      // GPU_PINNED -> GPU, refer to PR#41972
+      if (phi::AllocationType::GPUPINNED == place.GetType()) {
+        VLOG(6) << "need trans from GPUPINNED to GPU";
+        // build memcopy op
+        auto out_place = phi::TransToPhiPlace(phi::Backend::GPU);
+        auto new_in_alloc_type =
+            new_in_type.dyn_cast<AllocatedDenseTensorType>();
+        auto out_type =
+            AllocatedDenseTensorType::get(ctx,
+                                          out_place,
+                                          new_in_alloc_type.dtype(),
+                                          new_in_alloc_type.dims(),
+                                          new_in_alloc_type.data_layout(),
+                                          new_in_alloc_type.lod(),
+                                          new_in_alloc_type.offset());
+        new_in = AddPlaceTransferOp(
+            new_in, out_type, in_place, out_place, kernel_key, block);
+      }
+    }
+
+    vec_inputs.push_back(new_in);
+  }
+
+  // Prepare attr
+  std::unordered_map<std::string, pir::Attribute> op_attribute{
+      {"op_name", pir::StrAttribute::get(ctx, op_item->name())},
+      {"kernel_name", pir::StrAttribute::get(ctx, op_item->name())},
+      {"kernel_key", KernelAttribute::get(ctx, kernel_key)}};
+  auto op_attr_map = op_item->attributes();
+
+  for (auto& map_item : op_attr_map) {
+    op_attribute.emplace(map_item.first, map_item.second);
+  }
+
+  if (op_item->HasTrait<InplaceTrait>()) {
+    op_attribute.emplace("is_inplace", pir::BoolAttribute::get(ctx, true));
+  }
+
+  VLOG(6) << "Lower custom op: " << op_item->name()
+          << " to : " << CustomKernelOp::name();
+
+  pir::OpInfo custom_kernel_op_info =
+      ctx->GetRegisteredOpInfo(CustomKernelOp::name());
+
+  pir::Operation* op = nullptr;
+  op = pir::Operation::Create(
+      vec_inputs, op_attribute, op_output_types, custom_kernel_op_info);
+
+  (*map_op_pair)[op_item] = op;
+
+  // map each result of the original op to the new op's result
+  if (op_item->num_results() > 0) {
+    for (size_t i = 0; i < op_item->num_results(); ++i) {
+      (*map_value_pair)[op_item->result(i)] = op->result(i);
+    }
+  }
+  block->push_back(op);
+}
+
 std::vector<pir::Type> BuildOutputs(pir::Operation* op_item,
                                     const std::string& kernel_fn_str,
                                     const phi::KernelKey& kernel_key,
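`HandleForCustomOp` honors the same bookkeeping contract as the other lowering helpers: every result of the original op is mapped to the matching result of the newly created kernel op, so downstream ops can rewrite their operands through `map_value_pair`. A toy version of that remapping, with integers standing in for `pir::Value`:

#include <cassert>
#include <unordered_map>
#include <vector>

// Toy lowering bookkeeping: the map carries old-result -> new-result so
// consumers can be rewired, mirroring
// (*map_value_pair)[op_item->result(i)] = op->result(i) above.
int main() {
  std::unordered_map<int, int> value_map;  // old value id -> new value id

  // "Lower" an op whose results were 10 and 11 into one producing 20, 21.
  std::vector<int> old_results = {10, 11};
  std::vector<int> new_results = {20, 21};
  for (size_t i = 0; i < old_results.size(); ++i) {
    value_map[old_results[i]] = new_results[i];
  }

  // A later op that consumed old value 11 looks up its replacement,
  // just as the PADDLE_ENFORCE_EQ-guarded lookup does above.
  std::vector<int> operands = {11};
  for (auto& v : operands) {
    assert(value_map.count(v) == 1);
    v = value_map.at(v);
  }
  assert(operands[0] == 21);
}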
@@ -1508,75 +1680,8 @@ std::vector<pir::Type> BuildOutputs(pir::Operation* op_item,
         (!IsLegacyOp(op_item->name())) && phi_kernel.IsValid()) {
       out_place = phi::TransToPhiPlace(output_defs[i].backend);
     }
-
-    auto result_type = op_item->result(i).type();
-    if (!result_type) {
-      op_output_types.push_back(result_type);
-    } else if (result_type.isa<DenseTensorType>() ||
-               result_type.isa<SelectedRowsType>() ||
-               result_type.isa<DenseTensorArrayType>()) {
-#ifdef PADDLE_WITH_DNNL
-      if (kernel_key.backend() == phi::Backend::ONEDNN) {
-        op_output_types.push_back(BuildOutputType(
-            result_type, out_place, phi::DataLayout::ONEDNN, ctx));
-      } else {
-        op_output_types.push_back(BuildOutputType(result_type, out_place, ctx));
-      }
-#else
-      op_output_types.push_back(BuildOutputType(result_type, out_place, ctx));
-#endif
-
-    } else if (result_type.isa<pir::VectorType>()) {
-      std::vector<pir::Type> vec_inner_types;
-      auto base_types = result_type.dyn_cast<pir::VectorType>().data();
-      for (auto& base_type : base_types) {
-        if (base_type) {
-          if (base_type.isa<DenseTensorType>() ||
-              base_type.isa<SelectedRowsType>()) {
-#ifdef PADDLE_WITH_DNNL
-            if (kernel_key.backend() == phi::Backend::ONEDNN) {
-              vec_inner_types.push_back(BuildOutputType(
-                  base_type, out_place, phi::DataLayout::ONEDNN, ctx));
-            } else {
-              vec_inner_types.push_back(
-                  BuildOutputType(base_type, out_place, ctx));
-            }
-#else
-            vec_inner_types.push_back(
-                BuildOutputType(base_type, out_place, ctx));
-#endif
-          } else {
-            PADDLE_THROW(phi::errors::Unimplemented(
-                "only support dense tensor and selected rows in vector type "
-                "for now"));
-          }
-        } else {
-          // NOTE(phlrain), kernel not support a nullptr in output
-          pir::Type fp32_dtype = pir::Float32Type::get(ctx);
-          phi::DDim dims = {};
-          phi::DataLayout data_layout = phi::DataLayout::NCHW;
-#ifdef PADDLE_WITH_DNNL
-          if (kernel_key.backend() == phi::Backend::ONEDNN) {
-            data_layout = phi::DataLayout::ONEDNN;
-          }
-#endif
-          phi::LoD lod = {{}};
-          size_t offset = 0;
-          auto dense_tensor_dtype = DenseTensorType::get(
-              ctx, fp32_dtype, dims, data_layout, lod, offset);
-          auto allocated_dense_tensor_dtype =
-              AllocatedDenseTensorType::get(ctx, out_place, dense_tensor_dtype);
-          vec_inner_types.push_back(allocated_dense_tensor_dtype);
-        }
-      }
-
-      pir::Type t1 = pir::VectorType::get(ctx, vec_inner_types);
-      op_output_types.push_back(t1);
-    } else {
-      PADDLE_THROW(phi::errors::Unimplemented(
-          "Result type only support DenseTensorType, SelectedRowType and "
-          "VectorType"));
-    }
+    PushBackOutputTypes(
+        ctx, op_item, out_place, kernel_key, &op_output_types, i);
   }
 
   return op_output_types;
@@ -2074,6 +2179,18 @@ void ProcessBlock(
         op_item, place, kernel_name, *map_value_pair, op_info_parser.get());
     VLOG(6) << "kernel type " << kernel_key;
 
+    if (paddle::dialect::IsCustomOp(op_item)) {
+      HandleForCustomOp(ctx,
+                        op_item,
+                        kernel_key,
+                        place,
+                        op_info_parser.get(),
+                        map_op_pair,
+                        map_value_pair,
+                        new_block);
+      continue;
+    }
+
 #ifdef PADDLE_WITH_DNNL
     if (op_item->HasTrait<OneDNNTrait>() &&
         kernel_key.backend() != phi::Backend::ONEDNN) {
@@ -2147,6 +2264,8 @@ std::unique_ptr<pir::Program> PdOpLowerToKernelPass(pir::Program* prog,
   pir::IrContext* ctx = pir::IrContext::Instance();
   ctx->GetOrRegisterDialect<OperatorDialect>();
   ctx->GetOrRegisterDialect<KernelDialect>();
+  ctx->GetOrRegisterDialect<CustomKernelDialect>();
+
 #ifdef PADDLE_WITH_DNNL
   ctx->GetOrRegisterDialect<OneDNNOperatorDialect>();
   ctx->GetOrRegisterDialect<OneDNNKernelDialect>();
diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc
index 8813ff59de53e..1c398cf7cdf97 100644
--- a/paddle/fluid/pybind/pir.cc
+++ b/paddle/fluid/pybind/pir.cc
@@ -1406,7 +1406,7 @@ std::map<int, int> GetOpInplaceInfo(const pir::Operation *op) {
   pir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_name);
   paddle::dialect::OpYamlInfoParser yaml_parser(
       op_info.GetInterfaceImpl<paddle::dialect::OpYamlInfoInterface>()
-          ->get_op_info_(),
+          ->get_op_info_(op_name),
       paddle::dialect::IsLegacyOp(op_name));
 
   for (size_t i = 0; i < op->num_results(); ++i) {
diff --git a/paddle/phi/kernels/autotune/cache_base.h b/paddle/phi/kernels/autotune/cache_base.h
index 82af1ccbb7132..37f6106b1baa8 100644
--- a/paddle/phi/kernels/autotune/cache_base.h
+++ b/paddle/phi/kernels/autotune/cache_base.h
@@ -19,38 +19,12 @@
 #include
 #include "paddle/common/errors.h"
+#include "paddle/common/hash_funcs.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/flags.h"
 
 PHI_DECLARE_int32(search_cache_max_number);
 
-inline void HashCombine(std::size_t* seed UNUSED) {}
-
-// combine hash value
-// https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
-template <typename T, typename... Rest>
-inline void HashCombine(std::size_t* seed, const T& v, Rest... rest) {
-  std::hash<T> hasher;
-  *seed ^= hasher(v) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2);
-  *seed *= 0x00000100000001B3;
-  HashCombine(seed, rest...);
-}
-
-// custom specialization of std::hash can be injected in namespace std
-// ref: https://en.cppreference.com/w/cpp/utility/hash
-namespace std {
-template <typename T>
-struct hash<std::vector<T>> {
-  std::size_t operator()(std::vector<T> const& vec) const noexcept {
-    std::size_t seed = 0xcbf29ce484222325;
-    for (auto val : vec) {
-      HashCombine(&seed, val);
-    }
-    return seed;
-  }
-};
-}  // namespace std
-
 namespace phi {
 namespace autotune {
diff --git a/paddle/pir/core/builtin_type_storage.h b/paddle/pir/core/builtin_type_storage.h
index d8361658f9e85..77c3383f79798 100644
--- a/paddle/pir/core/builtin_type_storage.h
+++ b/paddle/pir/core/builtin_type_storage.h
@@ -16,28 +16,12 @@
 
 #include "paddle/common/ddim.h"
 #include "paddle/common/dim.h"
+#include "paddle/common/hash_funcs.h"
 #include "paddle/common/layout.h"
 #include "paddle/pir/core/type.h"
 #include "paddle/pir/core/type_base.h"
 #include "paddle/pir/core/utils.h"
 
-namespace std {
-///
-/// \brief Enable hashing std::vector<T> instances.
-///
-template <typename T>
-struct hash<std::vector<T>> {
-  std::size_t operator()(const std::vector<T>& dim) const {
-    std::size_t seed = 0;
-    for (size_t i = 0; i < dim.size(); ++i) {
-      seed ^= std::hash<T>()(dim[i]) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
-    }
-    return seed;
-  }
-};
-
-}  // namespace std
-
 namespace pir {
 ///
 /// \brief Define Parametric TypeStorage for DenseTensorType.
diff --git a/paddle/pir/core/operation.cc b/paddle/pir/core/operation.cc
index fc670d4e9e44e..c0ce8842155ab 100644
--- a/paddle/pir/core/operation.cc
+++ b/paddle/pir/core/operation.cc
@@ -117,7 +117,6 @@ Operation *Operation::Create(const std::vector<Value> &inputs,
       base_ptr += sizeof(detail::BlockOperandImpl);
     }
   }
-
   // 3.5. Construct Regions
   if (num_regions > 0) {
     op->regions_ = reinterpret_cast<Region *>(base_ptr);
@@ -126,7 +125,6 @@ Operation *Operation::Create(const std::vector<Value> &inputs,
       base_ptr += sizeof(Region);
     }
   }
-
   // 0. Verify
   if (op_info) {
     try {
diff --git a/test/cpp/pir/shape_dialect/CMakeLists.txt b/test/cpp/pir/shape_dialect/CMakeLists.txt
index 5c3aa2b9f4344..decfc90408846 100644
--- a/test/cpp/pir/shape_dialect/CMakeLists.txt
+++ b/test/cpp/pir/shape_dialect/CMakeLists.txt
@@ -1,8 +1,8 @@
-paddle_test(shape_op_test SRCS shape_op_test.cc DEPS gtest)
+paddle_test(shape_op_test SRCS shape_op_test.cc)
 
-paddle_test(shape_struct_test SRCS shape_struct_test.cc DEPS gtest)
+paddle_test(shape_struct_test SRCS shape_struct_test.cc)
 
-paddle_test(symbol_dim_expr_test SRCS symbol_dim_expr_test.cc DEPS gtest)
+paddle_test(symbol_dim_expr_test SRCS symbol_dim_expr_test.cc)
 
 paddle_test(symbol_dim_expr_util_test SRCS symbol_dim_expr_util_test.cc DEPS gtest)
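The code deleted here and in builtin_type_storage.h is the duplication that the shared paddle/common/hash_funcs.h header now provides in one place: a boost-style variadic combiner with an FNV-style multiply, plus a `std::hash` specialization for vectors. For reference, a standalone copy of the combining scheme with the same constants (an illustrative restatement, not the header itself):

#include <cstddef>
#include <cstdio>
#include <functional>
#include <vector>

inline void HashCombine(std::size_t* seed) {}

// Same scheme as the removed helpers: golden-ratio/shift mixing from the
// boost-style combiner, followed by a multiply with the FNV-1a 64-bit prime.
template <typename T, typename... Rest>
inline void HashCombine(std::size_t* seed, const T& v, Rest... rest) {
  std::hash<T> hasher;
  *seed ^= hasher(v) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2);
  *seed *= 0x00000100000001B3;  // FNV-1a 64-bit prime
  HashCombine(seed, rest...);
}

template <typename T>
std::size_t HashVector(const std::vector<T>& vec) {
  std::size_t seed = 0xcbf29ce484222325;  // FNV-1a 64-bit offset basis
  for (const auto& val : vec) {
    HashCombine(&seed, val);
  }
  return seed;
}

int main() {
  std::vector<int> dims = {2, 3, 4};
  std::printf("%zu\n", HashVector(dims));
  // Combining several fields into one cache key:
  std::size_t key = 0;
  HashCombine(&key, 64, 128, 3);
  std::printf("%zu\n", key);
}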