From 3f48bafad2440fbb5eb6ff9889c9c168af510f81 Mon Sep 17 00:00:00 2001
From: rich04lin <152049331+rich04lin@users.noreply.github.com>
Date: Thu, 5 Dec 2024 00:04:02 +0800
Subject: [PATCH 1/9] [CodeStyle][Typos][B-14,B-[17-19]] Fix typos

---
 _typos.toml                                   |  9 +-----
 .../new_executor/standalone_executor.cc       |  4 +--
 .../include/serialize_utils.h                 |  2 +-
 .../fluid/pybind/manual_static_op_function.h  |  2 +-
 .../paddle/jit/dy2static/partial_program.py   | 32 +++++++++----------
 python/paddle/tensorrt/export.py              |  4 +--
 python/paddle/tensorrt/util.py                |  2 +-
 .../test_eager_run_program_deprecated.py      |  4 +--
 .../test_run_program_op_deprecated.py         |  4 +--
 test/legacy_test/test_elementwise_add_op.py   |  4 +--
 .../test_imperative_triple_grad.py            |  2 +-
 test/tensorrt/tensorrt_test_base.py           |  4 +--
 12 files changed, 33 insertions(+), 40 deletions(-)

diff --git a/_typos.toml b/_typos.toml
index 8052ec5a4a661..86b568ff08be0 100644
--- a/_typos.toml
+++ b/_typos.toml
@@ -12,6 +12,7 @@ extend-exclude = [
 anc = 'anc'
 arange = "arange"
 astroid = 'astroid'
+ba = 'ba'
 Clas = 'Clas'
 clen = 'clen'
 dout = "dout"
@@ -35,14 +36,6 @@ blcok = 'blcok'
 bootom = 'bootom'
 bondary = 'bondary'
 branchs = 'branchs'
-Broardcast = 'Broardcast'
-Bradcast = 'Bradcast'
-Boardcast = 'Boardcast'
-Buitin = 'Buitin'
-buitlin = 'buitlin'
-buitin = 'buitin'
-builded = 'builded'
-ba = 'ba'
 cahe = 'cahe'
 Caculate = 'Caculate'
 caculate = 'caculate'
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc
index c257b87cc4520..a97601b563ab9 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -159,9 +159,9 @@ StandaloneExecutor::StandaloneExecutor(const phi::Place& place,
           common::errors::InvalidArgument(
               "When using pipeline strategy in auto "
               "prarallelism with new executor, "
-              "the backward subprogram must be builded in real "
+              "the backward subprogram must be built in real "
               "static build mode, but it can not "
-              "be staticly builded in this case. You can "
+              "be staticly built in this case. You can "
               "enable 'GLOG_v=1' to obtain log information."));
     }
   }
diff --git a/paddle/fluid/pir/serialize_deserialize/include/serialize_utils.h b/paddle/fluid/pir/serialize_deserialize/include/serialize_utils.h
index 707c0de0aaf9e..fffc9ceb6fb36 100644
--- a/paddle/fluid/pir/serialize_deserialize/include/serialize_utils.h
+++ b/paddle/fluid/pir/serialize_deserialize/include/serialize_utils.h
@@ -420,7 +420,7 @@ Json AttrTypeWriter::WriteBuiltInAttr(const pir::Attribute& attr) {
   } else {
     PADDLE_ENFORCE(false,
                    common::errors::InvalidArgument(
-                       "Unknown Attr %s when write Buitin dialect attr"));
+                       "Unknown Attr %s when write Builtin dialect attr"));
   }
   return attr_json;
 }
diff --git a/paddle/fluid/pybind/manual_static_op_function.h b/paddle/fluid/pybind/manual_static_op_function.h
index 25f0dba0bfbe8..5954d9d9a232e 100644
--- a/paddle/fluid/pybind/manual_static_op_function.h
+++ b/paddle/fluid/pybind/manual_static_op_function.h
@@ -966,7 +966,7 @@ static PyObject *builtin_combine_op(PyObject *self,
                                     PyObject *args,
                                     PyObject *kwargs) {
   try {
-    VLOG(6) << "Add buitin_combine op into program";
+    VLOG(6) << "Add builtin_combine op into program";
     VLOG(8) << "args count: " << (PyTuple_Size(args) / 2);
     // Get Value from args
     PyObject *x_obj = PyTuple_GET_ITEM(args, 0);
diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py
index b870f1bbf4f18..b2cb13bdec318 100644
--- a/python/paddle/jit/dy2static/partial_program.py
+++ b/python/paddle/jit/dy2static/partial_program.py
@@ -808,15 +808,15 @@ def _prepare_attributes(self):
     @switch_to_static_graph
     def _build_infer_program(self, infer_program, forward_end_op_index):
         forward_skip_vars = self._parse_skip_gc_vars(infer_program)
-        builded_infer_program = add_build_strategy_for(
+        built_infer_program = add_build_strategy_for(
             infer_program,
             0,
             forward_end_op_index,
             self._build_strategy,
             forward_skip_vars,
         )
-        self._apply_inplace_pass(builded_infer_program, None)
-        return builded_infer_program
+        self._apply_inplace_pass(built_infer_program, None)
+        return built_infer_program

     @switch_to_static_graph
     def _get_forward_backward_program_form(
@@ -833,7 +833,7 @@ def _get_forward_backward_program_form(
         backward_skip_vars = self._parse_skip_gc_vars(
             whole_program
         ) + self._grad_var_names.get('param', [])
-        backward_builded_program = add_build_strategy_for(
+        backward_built_program = add_build_strategy_for(
             whole_program,
             backward_start_op_index,
             backward_end_op_index,
@@ -842,9 +842,9 @@ def _get_forward_backward_program_form(
         )

         forward_skip_vars = self._parse_skip_gc_vars(
-            whole_program, backward_builded_program
+            whole_program, backward_built_program
         )
-        forward_builded_program = add_build_strategy_for(
+        forward_built_program = add_build_strategy_for(
             whole_program,
             0,
             forward_end_op_index,
@@ -853,26 +853,26 @@ def _get_forward_backward_program_form(
         )

         self._apply_inplace_pass(
-            forward_builded_program, backward_builded_program
+            forward_built_program, backward_built_program
         )

         # NOTE(Aurelius84): Export forward/backward program for SubGraphChecker,
         # see export_subgraph for detail.
         pir_exporter(
             self,
-            forward_builded_program,
+            forward_built_program,
             SubGraphRole.Forward,
             set(),
             set(forward_skip_vars),
         )
         pir_exporter(
             self,
-            backward_builded_program,
+            backward_built_program,
             SubGraphRole.Backward,
             set(forward_skip_vars),
             set(backward_skip_vars),
         )
-        return [forward_builded_program, backward_builded_program]
+        return [forward_built_program, backward_built_program]

     def _apply_inplace_pass(self, forward_program, backward_program):
         attr_types = {
@@ -1157,19 +1157,19 @@ def add_build_strategy_for(
             core.Scope(), framework._current_expected_place()
         )
         ir_graph = framework.IrGraph(compiled_program._graph)
-        builded_program = ir_graph.to_program()
+        built_program = ir_graph.to_program()
         if hasattr(compiled_program._program, 'lr_scheduler'):
-            builded_program.lr_scheduler = (
+            built_program.lr_scheduler = (
                 compiled_program._program.lr_scheduler
             )
     else:
         # can't just create a new program, we need copy the vardesc.
-        builded_program = paddle.static.Program()
+        built_program = paddle.static.Program()
         for var in program.block(0).vars.values():
-            builded_program.block(0)._clone_variable(var, False)
+            built_program.block(0)._clone_variable(var, False)

         # set back the parent_idx of blocks
-        for origin, current in zip(program.blocks, builded_program.blocks):
+        for origin, current in zip(program.blocks, built_program.blocks):
             current.desc.set_parent_idx(origin.desc.parent)

-    return builded_program
+    return built_program
diff --git a/python/paddle/tensorrt/export.py b/python/paddle/tensorrt/export.py
index 53a2522031f7e..a045bc51071a4 100644
--- a/python/paddle/tensorrt/export.py
+++ b/python/paddle/tensorrt/export.py
@@ -35,7 +35,7 @@
 from paddle.tensorrt.converter import PaddleToTensorRTConverter
 from paddle.tensorrt.util import (
     forbid_op_lower_trt,
-    mark_buitlin_op,
+    mark_builtin_op,
     run_pir_pass,
     warmup_shape_infer,
 )
@@ -232,7 +232,7 @@ def convert_to_trt(program, trt_config, scope):
     forbid_op_lower_trt(program, trt_config.disable_ops)

     # Adding marker labels to builtin ops facilitates convert processing, but they ultimately do not enter the TensorRT subgraph.
-    mark_buitlin_op(program)
+    mark_builtin_op(program)

     # run pir pass (including trt_sub_graph_extract_pass)
     program_with_pir = run_pir_pass(program, partition_mode=True)
diff --git a/python/paddle/tensorrt/util.py b/python/paddle/tensorrt/util.py
index 72f917a84bfd3..bcfa46a3dd66f 100644
--- a/python/paddle/tensorrt/util.py
+++ b/python/paddle/tensorrt/util.py
@@ -122,7 +122,7 @@ def get_trt_version_list():


 # Adding marker labels to builtin ops facilitates convert processing, but they ultimately do not enter the TensorRT subgraph.
-def mark_buitlin_op(program):
+def mark_builtin_op(program):
     for op in program.global_block().ops:
         if op.name() == "builtin.split":
             defining_op = op.operands()[0].source().get_defining_op()
diff --git a/test/deprecated/legacy_test/test_eager_run_program_deprecated.py b/test/deprecated/legacy_test/test_eager_run_program_deprecated.py
index 00b29d9c0068b..4960b8a587f31 100644
--- a/test/deprecated/legacy_test/test_eager_run_program_deprecated.py
+++ b/test/deprecated/legacy_test/test_eager_run_program_deprecated.py
@@ -78,8 +78,8 @@ def _add_build_strategy_for(input_program, start_op_index, end_op_index):
         core.Scope(), paddle.framework._current_expected_place()
     )
     ir_graph = paddle.base.framework.IrGraph(compiled_program._graph)
-    builded_program = ir_graph.to_program()
-    return builded_program
+    built_program = ir_graph.to_program()
+    return built_program


 class TestRunProgram(unittest.TestCase):
diff --git a/test/deprecated/legacy_test/test_run_program_op_deprecated.py b/test/deprecated/legacy_test/test_run_program_op_deprecated.py
index 7b8b841368965..0e84d9227add1 100644
--- a/test/deprecated/legacy_test/test_run_program_op_deprecated.py
+++ b/test/deprecated/legacy_test/test_run_program_op_deprecated.py
@@ -47,8 +47,8 @@ def _add_build_strategy_for(input_program, start_op_index, end_op_index):
         core.Scope(), paddle.framework._current_expected_place()
     )
     ir_graph = paddle.base.framework.IrGraph(compiled_program._graph)
-    builded_program = ir_graph.to_program()
-    return builded_program
+    built_program = ir_graph.to_program()
+    return built_program


 @switch_to_static_graph
diff --git a/test/legacy_test/test_elementwise_add_op.py b/test/legacy_test/test_elementwise_add_op.py
index a3ccad172f7c2..0cf79f4ad7165 100644
--- a/test/legacy_test/test_elementwise_add_op.py
+++ b/test/legacy_test/test_elementwise_add_op.py
@@ -986,7 +986,7 @@ def if_enable_cinn(self):
         pass


-class TestElementwiseAddOpAutoParallelXShardBoardcast(
+class TestElementwiseAddOpAutoParallelXShardBroadcast(
     TestElementwiseAddOpAutoParallel
 ):
     def init_placements(self):
@@ -1023,7 +1023,7 @@ def init_input_output(self):
         self.out = np.add(self.x, self.y)


-class TestElementwiseAddOpAutoParallelXYShardBroardcast(
+class TestElementwiseAddOpAutoParallelXYShardBroadcast(
     TestElementwiseAddOpAutoParallelXYShard
 ):
     def init_placements(self):
diff --git a/test/legacy_test/test_imperative_triple_grad.py b/test/legacy_test/test_imperative_triple_grad.py
index 09372aaf9c976..60425c31c955e 100644
--- a/test/legacy_test/test_imperative_triple_grad.py
+++ b/test/legacy_test/test_imperative_triple_grad.py
@@ -227,7 +227,7 @@ def test_all_cases(self):
         self.func_example_with_gradient_and_create_graph()


-class TestDygraphTripleGradBradcastCase(TestCase):
+class TestDygraphTripleGradBroadcastCase(TestCase):
     def setUp(self):
         self.sort_sum_gradient = False
         self.x_shape = [3, 2, 2]
diff --git a/test/tensorrt/tensorrt_test_base.py b/test/tensorrt/tensorrt_test_base.py
index c4995adafdf47..bc4f53ada043e 100755
--- a/test/tensorrt/tensorrt_test_base.py
+++ b/test/tensorrt/tensorrt_test_base.py
@@ -21,7 +21,7 @@
 from paddle.base import core
 from paddle.tensorrt.converter import PaddleToTensorRTConverter
 from paddle.tensorrt.util import (
-    mark_buitlin_op,
+    mark_builtin_op,
     run_pir_pass,
     warmup_shape_infer,
 )
@@ -242,7 +242,7 @@ def check_trt_result(self, rtol=1e-5, atol=1e-5):
         main_program = run_pir_pass(main_program, partition_mode=False)

         # Adding marker labels to builtin ops facilitates convert processing, but they ultimately do not enter the TensorRT subgraph.
-        mark_buitlin_op(main_program)
+        mark_builtin_op(main_program)

         # run trt_sub_graph_extract_pass()
         program_with_trt = run_pir_pass(main_program, partition_mode=True)

From 125ce6654af5a22e92ea4fff113d2be4cfe6cdd6 Mon Sep 17 00:00:00 2001
From: rich04lin <152049331+rich04lin@users.noreply.github.com>
Date: Thu, 5 Dec 2024 13:01:45 +0800
Subject: [PATCH 2/9] [CodeStyle][Typos][B-14,B-[17-19]] Fix
 typos(Broardcast,Bradcast,Boardcast,buitin,buitlin,Buitin,builded,ba)

---
 python/paddle/jit/dy2static/partial_program.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py
index b2cb13bdec318..51ec53b968e24 100644
--- a/python/paddle/jit/dy2static/partial_program.py
+++ b/python/paddle/jit/dy2static/partial_program.py
@@ -852,9 +852,7 @@ def _get_forward_backward_program_form(
             forward_skip_vars,
         )

-        self._apply_inplace_pass(
-            forward_built_program, backward_built_program
-        )
+        self._apply_inplace_pass(forward_built_program, backward_built_program)

         # NOTE(Aurelius84): Export forward/backward program for SubGraphChecker,
         # see export_subgraph for detail.
@@ -1159,9 +1157,7 @@ def add_build_strategy_for(
         ir_graph = framework.IrGraph(compiled_program._graph)
         built_program = ir_graph.to_program()
         if hasattr(compiled_program._program, 'lr_scheduler'):
-            built_program.lr_scheduler = (
-                compiled_program._program.lr_scheduler
-            )
+            built_program.lr_scheduler = compiled_program._program.lr_scheduler
     else:
         # can't just create a new program, we need copy the vardesc.
         built_program = paddle.static.Program()

From 904179008a0fa63aee22fd54e193cedc2dfe51a2 Mon Sep 17 00:00:00 2001
From: rich04lin <152049331+rich04lin@users.noreply.github.com>
Date: Fri, 6 Dec 2024 23:57:11 +0800
Subject: [PATCH 3/9] [CodeStyle][Typos][C-[4-9] Fix
 typos(cacl,cll,candiate,cadidate,connot,CANN,Cann,cann,vart)

---
 _typos.toml                                            | 10 +---------
 paddle/cinn/hlir/framework/pir/trivial_op_util.cc      |  2 +-
 paddle/fluid/operators/data_norm_op.cu                 |  2 +-
 .../fluid/pir/dialect/distributed/ir/dist_attribute.cc |  2 +-
 .../phi/core/distributed/auto_parallel/dist_tensor.cc  |  2 +-
 paddle/phi/kernels/xpu/fused_attention_kernel.cc       |  6 +++---
 paddle/pir/src/core/op_info_impl.cc                    |  2 +-
 python/paddle/base/backward.py                         |  2 +-
 python/paddle/base/framework.py                        |  6 +++---
 python/paddle/distributed/auto_parallel/constants.py   |  4 ++--
 python/paddle/distribution/kl.py                       |  2 +-
 python/paddle/nn/functional/activation.py              |  4 ++--
 python/paddle/nn/layer/activation.py                   |  4 ++--
 python/paddle/optimizer/optimizer.py                   |  2 +-
 test/deprecated/legacy_test/auto_parallel_op_test.py   |  2 +-
 test/ir/inference/test_fc_fuse_pass.py                 |  2 +-
 test/legacy_test/auto_parallel_op_test.py              |  2 +-
 test/legacy_test/test_nanmedian.py                     |  8 ++++----
 18 files changed, 28 insertions(+), 36 deletions(-)

diff --git a/_typos.toml b/_typos.toml
index 8697561e4d946..4169cb7566776 100644
--- a/_typos.toml
+++ b/_typos.toml
@@ -13,6 +13,7 @@ anc = 'anc'
 arange = "arange"
 astroid = 'astroid'
 ba = 'ba'
+CANN = 'CANN'
 Clas = 'Clas'
 clen = 'clen'
 dout = "dout"
@@ -36,15 +37,6 @@ cahe = 'cahe'
 Caculate = 'Caculate'
 caculate = 'caculate'
 calcualtion = 'calcualtion'
-cacl = 'cacl'
-cll = 'cll'
-candiate = 'candiate'
-cadidate = 'cadidate'
-connot = 'connot'
-CANN = 'CANN'
-Cann = 'Cann'
-cann = 'cann'
-vart = 'vart'
 checkings = 'checkings'
 childs = 'childs'
 comsume = 'comsume'
diff --git a/paddle/cinn/hlir/framework/pir/trivial_op_util.cc
b/paddle/cinn/hlir/framework/pir/trivial_op_util.cc index 5d7d4d35d910a..aa61b5e5d41f9 100644 --- a/paddle/cinn/hlir/framework/pir/trivial_op_util.cc +++ b/paddle/cinn/hlir/framework/pir/trivial_op_util.cc @@ -160,7 +160,7 @@ ir::Expr CopyedReplaceExpr(const Expr& source, candidates.size(), ::common::errors::InvalidArgument( "In ReplaceExpr, the size of Vars to be replaced must be equal to " - "the size of cadidate Exprs! Please check.")); + "the size of candidate Exprs! Please check.")); auto copyed_source = ir::ir_utils::IRCopy(source); if (replaced.empty()) return copyed_source; std::map replacing_map; diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu index 179c13d9d36fb..7b3fc74d2a0d2 100644 --- a/paddle/fluid/operators/data_norm_op.cu +++ b/paddle/fluid/operators/data_norm_op.cu @@ -300,7 +300,7 @@ class DataNormGradKernel : public framework::OpKernel { phi::backends::gpu::GpuStreamSync(stream); #else PADDLE_THROW(common::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU, and need_sync_stats connot be " + "PaddlePaddle should compile with GPU, and need_sync_stats cannot be " "supported on windows now.")); #endif } diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_attribute.cc b/paddle/fluid/pir/dialect/distributed/ir/dist_attribute.cc index 16db2c543e2c5..e2bcbf3d718c5 100644 --- a/paddle/fluid/pir/dialect/distributed/ir/dist_attribute.cc +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_attribute.cc @@ -77,7 +77,7 @@ phi::distributed::Placements TensorDistAttribute::placements() const { auto& p = placements[mesh_id]; if (p->is_shard()) { PADDLE_THROW(common::errors::PreconditionNotMet( - "ProcessMesh dimension cann't be mapped to two dimension of the " + "ProcessMesh dimension can't be mapped to two dimension of the " "same tensor: {%d} and {%d}", i, dynamic_cast(*p).get_dim())); diff --git a/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc b/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc index cc22d17867ef9..50e3a6cca00d6 100644 --- a/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc +++ b/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc @@ -104,7 +104,7 @@ Placements ToPlacements(const TensorDistAttr& dist_attr) { if (p->is_shard()) { PADDLE_THROW(common::errors::PreconditionNotMet( - "ProcessMesh dimension cann't be mapped to two dimension of the " + "ProcessMesh dimension can't be mapped to two dimension of the " "same tensor: {%d} and {%d}", i, dynamic_cast(*p).get_dim())); diff --git a/paddle/phi/kernels/xpu/fused_attention_kernel.cc b/paddle/phi/kernels/xpu/fused_attention_kernel.cc index b7a1c8a638648..cbc8929aed90b 100644 --- a/paddle/phi/kernels/xpu/fused_attention_kernel.cc +++ b/paddle/phi/kernels/xpu/fused_attention_kernel.cc @@ -233,7 +233,7 @@ void FusedAttentionKernel(const Context &dev_ctx, } int r = 0; - const XPUTypeT *x_cacl_ptr = input_x_ptr; + const XPUTypeT *x_calc_ptr = input_x_ptr; if (pre_layer_norm) { r = xpu::layer_norm(xpu_ctx, input_x_ptr, @@ -246,7 +246,7 @@ void FusedAttentionKernel(const Context &dev_ctx, ln_mean_ptr, ln_var_ptr); PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm"); - x_cacl_ptr = ln_out_ptr; + x_calc_ptr = ln_out_ptr; } // fc @@ -262,7 +262,7 @@ void FusedAttentionKernel(const Context &dev_ctx, nullptr); phi::MatMulXPUFunction(xpu_ctx, - x_cacl_ptr, + x_calc_ptr, qkv_weight_ptr, qkv_before_transpose_ptr, qkv_fc_info, diff --git a/paddle/pir/src/core/op_info_impl.cc b/paddle/pir/src/core/op_info_impl.cc index 08978ff061a65..8c262326ee516 
100644 --- a/paddle/pir/src/core/op_info_impl.cc +++ b/paddle/pir/src/core/op_info_impl.cc @@ -23,7 +23,7 @@ namespace pir { void OpInfo::AttachInterface(InterfaceValue &&interface_value) { PADDLE_ENFORCE_NOT_NULL(impl_, common::errors::InvalidArgument( - "Cann't attach interface to a nullptr OpInfo")); + "Can't attach interface to a nullptr OpInfo")); impl_->AttachInterface(std::move(interface_value)); } diff --git a/python/paddle/base/backward.py b/python/paddle/base/backward.py index 1ae69767335df..6b993b0f3482b 100755 --- a/python/paddle/base/backward.py +++ b/python/paddle/base/backward.py @@ -1840,7 +1840,7 @@ def infershape_for_composite(block, grad_op_desc): for name, args in grad_op_desc.outputs().items() }, # NOTE Runtime attr will be ignore as the c++ GetRuntimeAttr - # interface cann't be exported to python. Please note the WARNING + # interface can't be exported to python. Please note the WARNING # message logged in RuntimeAttrs of composite_grad_desc_maker.h attrs=grad_op_desc.get_attr_map(), ) diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py index 4c6080a97ac76..9c1be41ab6b9f 100644 --- a/python/paddle/base/framework.py +++ b/python/paddle/base/framework.py @@ -5556,7 +5556,7 @@ def create_persistable_node(self, name, var_type, shape, var_dtype): Args: name(str): the name of the persistable variable node. - vart_type(core.VarDesc.VarType): the type of the persistable variable node. + var_type(core.VarDesc.VarType): the type of the persistable variable node. shape(list): the shape of the persistable variable node. var_dtype(core.VarDesc.VarType): the data type of the persistable variable node. @@ -5577,7 +5577,7 @@ def create_var_node(self, name, var_type, shape, var_dtype): Args: name(str): the name of the variable node. - vart_type(core.VarDesc.VarType): the type of the variable node. + var_type(core.VarDesc.VarType): the type of the variable node. shape(list): the shape of the variable node. var_dtype(core.VarDesc.VarType): the data type of the variable node. @@ -6852,7 +6852,7 @@ def _remove_training_info(self, clip_extra=True): res.blocks = [Block(res, i) for i in range(res.desc.num_blocks())] res._sync_with_cpp() - # Note: The op_role and op_role_var cann't be deleted currently, + # Note: The op_role and op_role_var can't be deleted currently, # and we will try to remove them in the future. 
common_clipped_attrs_list = ["op_callstack", "with_quant_attr"] diff --git a/python/paddle/distributed/auto_parallel/constants.py b/python/paddle/distributed/auto_parallel/constants.py index 74f9a3f3660e7..362dd8170eedb 100644 --- a/python/paddle/distributed/auto_parallel/constants.py +++ b/python/paddle/distributed/auto_parallel/constants.py @@ -320,7 +320,7 @@ class _FusedPassesConfig(TypedDict, total=False): # noqa: PYI049 set_field_default_config(DP_OPTIMIZATION, "enable", False) set_field_default_config(DP_OPTIMIZATION, "fuse_all_reduce_ops", True) set_field_default_config(DP_OPTIMIZATION, "fuse_grad_size_in_MB", 32) -set_field_default_config(DP_OPTIMIZATION, "overlap_comm_cacl", True) +set_field_default_config(DP_OPTIMIZATION, "overlap_comm_calc", True) set_field_default_config( DP_OPTIMIZATION, "gradient_sync_after_accumulate", False ) @@ -331,7 +331,7 @@ class _DPOptimizationConfig(TypedDict, total=False): # noqa: PYI049 enable: bool fuse_all_reduce_ops: bool fuse_grad_size_in_MB: int - overlap_comm_cacl: bool + overlap_comm_calc: bool gradient_sync_after_accumulate: bool diff --git a/python/paddle/distribution/kl.py b/python/paddle/distribution/kl.py index de8e12ff0071a..53c6e7778c69b 100644 --- a/python/paddle/distribution/kl.py +++ b/python/paddle/distribution/kl.py @@ -264,7 +264,7 @@ def _kl_expfamily_expfamily(p, q): p_grads = paddle.static.gradients(p_log_norm, p_natural_params) except RuntimeError as e: raise TypeError( - "Cann't compute kl_divergence({cls_p}, {cls_q}) use bregman divergence. Please register_kl({cls_p}, {cls_q}).".format( + "Can't compute kl_divergence({cls_p}, {cls_q}) use bregman divergence. Please register_kl({cls_p}, {cls_q}).".format( cls_p=type(p).__name__, cls_q=type(q).__name__ ) ) from e diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index c3ae8f572914b..8f545043441a6 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -279,7 +279,7 @@ def hardtanh( hardtanh(x)= \left\{ - \begin{array}{cll} + \begin{array}{cl} max,& & \text{if } x > max \\ min,& & \text{if } x < min \\ x,& & \text{otherwise} @@ -410,7 +410,7 @@ def hardswish(x: Tensor, name: str | None = None) -> Tensor: hardswish(x)= \left\{ - \begin{array}{cll} + \begin{array}{cl} 0 &, & \text{if } x \leq -3 \\ x &, & \text{if } x \geq 3 \\ \frac{x(x+3)}{6} &, & \text{otherwise} diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index 1edbc26169f7a..cea146521bed8 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -293,7 +293,7 @@ class Hardswish(Layer): Hardswish(x)= \left\{ - \begin{array}{cll} + \begin{array}{cl} 0 &, & \text{if } x \leq -3 \\ x &, & \text{if } x \geq 3 \\ \frac{x(x+3)}{6} &, & \text{otherwise} @@ -384,7 +384,7 @@ class Hardtanh(Layer): Hardtanh(x)= \left\{ - \begin{array}{cll} + \begin{array}{cl} max,& & \text{if } x > max \\ min,& & \text{if } x < min \\ x,& & \text{otherwise} diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 79420f892f749..f9fe375a6810a 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -803,7 +803,7 @@ def _global_learning_rate(self, program=None): def _append_optimize_op(self, block, param_and_grad): """append optimize operator to block and return all the added optimize_op""" raise NotImplementedError( - 'Class "Optimizer" connot be used directly as an optimizer, please use its 
subclasses such as "Adam"' + 'Class "Optimizer" cannot be used directly as an optimizer, please use its subclasses such as "Adam"' ) def _create_param_lr(self, param_and_grad): diff --git a/test/deprecated/legacy_test/auto_parallel_op_test.py b/test/deprecated/legacy_test/auto_parallel_op_test.py index 1e6c9dfe4547d..5efe97b6e8c97 100644 --- a/test/deprecated/legacy_test/auto_parallel_op_test.py +++ b/test/deprecated/legacy_test/auto_parallel_op_test.py @@ -404,7 +404,7 @@ def dims_map_to_placements( if placement.is_shard(): placement = cast(dist.Shard, placement) raise RuntimeError( - f"DeviceMesh dimension cann't be mapped to two dimension of the same tensor: {i} and {placement.dim}" + f"DeviceMesh dimension can't be mapped to two dimension of the same tensor: {i} and {placement.dim}" ) elif placement.is_partial(): raise RuntimeError( diff --git a/test/ir/inference/test_fc_fuse_pass.py b/test/ir/inference/test_fc_fuse_pass.py index 237faff87149e..2af6732700f67 100644 --- a/test/ir/inference/test_fc_fuse_pass.py +++ b/test/ir/inference/test_fc_fuse_pass.py @@ -55,7 +55,7 @@ def teller1(program_config, predictor_config): bias_shape = list(program_config.weights["bias"].shape) if predictor_config.tensorrt_engine_enabled(): - # TensorRT cann't handle all the situation of elementwise_add + # TensorRT can't handle all the situation of elementwise_add # disable it until this problem fixed predictor_config.exp_disable_tensorrt_ops(["elementwise_add"]) diff --git a/test/legacy_test/auto_parallel_op_test.py b/test/legacy_test/auto_parallel_op_test.py index c74ee27e07ff6..5265ecbdfeda9 100644 --- a/test/legacy_test/auto_parallel_op_test.py +++ b/test/legacy_test/auto_parallel_op_test.py @@ -403,7 +403,7 @@ def dims_map_to_placements( if placement.is_shard(): placement = cast(dist.Shard, placement) raise RuntimeError( - f"DeviceMesh dimension cann't be mapped to two dimension of the same tensor: {i} and {placement.dim}" + f"DeviceMesh dimension can't be mapped to two dimension of the same tensor: {i} and {placement.dim}" ) elif placement.is_partial(): raise RuntimeError( diff --git a/test/legacy_test/test_nanmedian.py b/test/legacy_test/test_nanmedian.py index 3507928b9014b..9695c0da35179 100644 --- a/test/legacy_test/test_nanmedian.py +++ b/test/legacy_test/test_nanmedian.py @@ -147,7 +147,7 @@ def setUp(self): if core.is_compiled_with_cuda() else paddle.CPUPlace() ) - self.axis_candiate_list = [ + self.axis_candidate_list = [ None, 0, 2, @@ -231,7 +231,7 @@ def test_axis_case(data, axis): for name, data in self.fake_data.items(): test_data_case(data, name) - for axis in self.axis_candiate_list: + for axis in self.axis_candidate_list: test_axis_case(self.fake_data["row_nan_even"], axis) test_axis_case(self.fake_data["col_nan_odd"], axis) @@ -402,7 +402,7 @@ def setUp(self): if core.is_compiled_with_cuda() else paddle.CPUPlace() ) - self.axis_candiate_list = [ + self.axis_candidate_list = [ None, 0, 2, @@ -480,7 +480,7 @@ def test_axis_case(data, axis): for name, data in self.fake_data.items(): test_data_case(data, name) - for axis in self.axis_candiate_list: + for axis in self.axis_candidate_list: test_axis_case(self.fake_data["row_nan_even"], axis) test_axis_case(self.fake_data["col_nan_odd"], axis) From 62692027954babfcb446beebce7a86eb59452dc6 Mon Sep 17 00:00:00 2001 From: rich04lin <152049331+rich04lin@users.noreply.github.com> Date: Sat, 7 Dec 2024 16:08:01 +0800 Subject: [PATCH 4/9] c4-9 --- _typos.toml | 1 + python/paddle/nn/functional/activation.py | 4 ++-- 
python/paddle/nn/layer/activation.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/_typos.toml b/_typos.toml index 4169cb7566776..5a34c907ced56 100644 --- a/_typos.toml +++ b/_typos.toml @@ -16,6 +16,7 @@ ba = 'ba' CANN = 'CANN' Clas = 'Clas' clen = 'clen' +cll = 'cll' dout = "dout" eles = 'eles' grad = "grad" diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 8f545043441a6..c3ae8f572914b 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -279,7 +279,7 @@ def hardtanh( hardtanh(x)= \left\{ - \begin{array}{cl} + \begin{array}{cll} max,& & \text{if } x > max \\ min,& & \text{if } x < min \\ x,& & \text{otherwise} @@ -410,7 +410,7 @@ def hardswish(x: Tensor, name: str | None = None) -> Tensor: hardswish(x)= \left\{ - \begin{array}{cl} + \begin{array}{cll} 0 &, & \text{if } x \leq -3 \\ x &, & \text{if } x \geq 3 \\ \frac{x(x+3)}{6} &, & \text{otherwise} diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index cea146521bed8..1edbc26169f7a 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -293,7 +293,7 @@ class Hardswish(Layer): Hardswish(x)= \left\{ - \begin{array}{cl} + \begin{array}{cll} 0 &, & \text{if } x \leq -3 \\ x &, & \text{if } x \geq 3 \\ \frac{x(x+3)}{6} &, & \text{otherwise} @@ -384,7 +384,7 @@ class Hardtanh(Layer): Hardtanh(x)= \left\{ - \begin{array}{cl} + \begin{array}{cll} max,& & \text{if } x > max \\ min,& & \text{if } x < min \\ x,& & \text{otherwise} From 5d92c40aeb2bc227b041ec5d7cb35c271418a90a Mon Sep 17 00:00:00 2001 From: rich04lin <152049331+rich04lin@users.noreply.github.com> Date: Sun, 8 Dec 2024 00:27:50 +0800 Subject: [PATCH 5/9] [CodeStyle][Typos][B-14,B-[17-19]] Fix typos(Broardcast,Bradcast,Boardcast,buitin,buitlin,Buitin,builded,ba) --- _typos.toml | 1 + paddle/phi/kernels/xpu/fused_attention_kernel.cc | 6 +++--- python/paddle/distributed/auto_parallel/constants.py | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/_typos.toml b/_typos.toml index 5a34c907ced56..86b9f2bb7cd4b 100644 --- a/_typos.toml +++ b/_typos.toml @@ -13,6 +13,7 @@ anc = 'anc' arange = "arange" astroid = 'astroid' ba = 'ba' +cacl = 'cacl' CANN = 'CANN' Clas = 'Clas' clen = 'clen' diff --git a/paddle/phi/kernels/xpu/fused_attention_kernel.cc b/paddle/phi/kernels/xpu/fused_attention_kernel.cc index cbc8929aed90b..b7a1c8a638648 100644 --- a/paddle/phi/kernels/xpu/fused_attention_kernel.cc +++ b/paddle/phi/kernels/xpu/fused_attention_kernel.cc @@ -233,7 +233,7 @@ void FusedAttentionKernel(const Context &dev_ctx, } int r = 0; - const XPUTypeT *x_calc_ptr = input_x_ptr; + const XPUTypeT *x_cacl_ptr = input_x_ptr; if (pre_layer_norm) { r = xpu::layer_norm(xpu_ctx, input_x_ptr, @@ -246,7 +246,7 @@ void FusedAttentionKernel(const Context &dev_ctx, ln_mean_ptr, ln_var_ptr); PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm"); - x_calc_ptr = ln_out_ptr; + x_cacl_ptr = ln_out_ptr; } // fc @@ -262,7 +262,7 @@ void FusedAttentionKernel(const Context &dev_ctx, nullptr); phi::MatMulXPUFunction(xpu_ctx, - x_calc_ptr, + x_cacl_ptr, qkv_weight_ptr, qkv_before_transpose_ptr, qkv_fc_info, diff --git a/python/paddle/distributed/auto_parallel/constants.py b/python/paddle/distributed/auto_parallel/constants.py index 362dd8170eedb..74f9a3f3660e7 100644 --- a/python/paddle/distributed/auto_parallel/constants.py +++ b/python/paddle/distributed/auto_parallel/constants.py @@ -320,7 +320,7 @@ 
class _FusedPassesConfig(TypedDict, total=False): # noqa: PYI049 set_field_default_config(DP_OPTIMIZATION, "enable", False) set_field_default_config(DP_OPTIMIZATION, "fuse_all_reduce_ops", True) set_field_default_config(DP_OPTIMIZATION, "fuse_grad_size_in_MB", 32) -set_field_default_config(DP_OPTIMIZATION, "overlap_comm_calc", True) +set_field_default_config(DP_OPTIMIZATION, "overlap_comm_cacl", True) set_field_default_config( DP_OPTIMIZATION, "gradient_sync_after_accumulate", False ) @@ -331,7 +331,7 @@ class _DPOptimizationConfig(TypedDict, total=False): # noqa: PYI049 enable: bool fuse_all_reduce_ops: bool fuse_grad_size_in_MB: int - overlap_comm_calc: bool + overlap_comm_cacl: bool gradient_sync_after_accumulate: bool From 7718b832a40a1c7050e714283370ae3962a6ab6c Mon Sep 17 00:00:00 2001 From: rich04lin <152049331+rich04lin@users.noreply.github.com> Date: Tue, 10 Dec 2024 14:00:06 +0800 Subject: [PATCH 6/9] c48-51 --- _typos.toml | 4 ---- paddle/fluid/distributed/ps/service/brpc_ps_client.cc | 4 ++-- .../new_executor/instruction/control_flow/if_instruction.cc | 5 +++-- .../instruction/control_flow/while_instruction.cc | 2 +- .../framework/new_executor/instruction/instruction_util.cc | 4 ++-- .../framework/new_executor/instruction/instruction_util.h | 2 +- paddle/phi/kernels/funcs/fft_key.h | 4 ++-- python/paddle/distributed/passes/ps_trainer_pass.py | 2 +- 8 files changed, 12 insertions(+), 15 deletions(-) diff --git a/_typos.toml b/_typos.toml index 86b9f2bb7cd4b..d5d8360c0e6fe 100644 --- a/_typos.toml +++ b/_typos.toml @@ -41,10 +41,6 @@ caculate = 'caculate' calcualtion = 'calcualtion' checkings = 'checkings' childs = 'childs' -comsume = 'comsume' -Continer = 'Continer' -contenst = 'contenst' -conter = 'conter' Continous = 'Continous' contibute = 'contibute' controled = 'controled' diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc index 519d39484a7c5..a724e55be391b 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc @@ -1531,7 +1531,7 @@ std::future BrpcPsClient::PushSparse(size_t table_id, CostTimer parse_timer("pserver_client_push_sparse_parse"); int push_sparse_async_num = _push_sparse_task_queue_map[table_id]->Size(); while (push_sparse_async_num > FLAGS_pserver_max_async_call_num) { - // LOG(INFO) << "PushSparse Waiting for async_call_num comsume, + // LOG(INFO) << "PushSparse Waiting for async_call_num consume, // task_num:" // << push_sparse_async_num // << ", max_task_limit:" << FLAGS_pserver_max_async_call_num; @@ -1892,7 +1892,7 @@ std::future BrpcPsClient::PushDense(const Region *regions, std::make_shared("pserver_client_push_dense_parse"); int push_dense_async_num = _push_dense_task_queue_map[table_id]->Size(); while (push_dense_async_num > FLAGS_pserver_max_async_call_num) { - // LOG(INFO) << "PushDense Waiting for async_call_num comsume, + // LOG(INFO) << "PushDense Waiting for async_call_num consume, // task_num:" // << push_dense_async_num // << ", max_task_limit:" << FLAGS_pserver_max_async_call_num; diff --git a/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc b/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc index bbbcaf9c64815..1b1231359fe83 100644 --- a/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc @@ -121,8 +121,9 @@ 
IfInstruction::IfInstruction(size_t id, is_last_op = false; } } - InsertTuplePushContinerToOuts(&true_branch_block, *value_exec_info, &outputs); - InsertTuplePushContinerToOuts( + InsertTuplePushContainerToOuts( + &true_branch_block, *value_exec_info, &outputs); + InsertTuplePushContainerToOuts( &if_op.false_block(), *value_exec_info, &outputs); InsertInplacedExternalInputsToOuts( diff --git a/paddle/fluid/framework/new_executor/instruction/control_flow/while_instruction.cc b/paddle/fluid/framework/new_executor/instruction/control_flow/while_instruction.cc index bdd6c97e61631..d807c64ccee7d 100644 --- a/paddle/fluid/framework/new_executor/instruction/control_flow/while_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/control_flow/while_instruction.cc @@ -110,7 +110,7 @@ WhileInstruction::WhileInstruction( outputs.emplace(value, outputs_id); } } - InsertTuplePushContinerToOuts(body_block_, *parent_exe_info, &outputs); + InsertTuplePushContainerToOuts(body_block_, *parent_exe_info, &outputs); InsertInplacedExternalInputsToOuts( body_block_, body_outside_inputs, *parent_exe_info, &outputs); SetOutputs(outputs); diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc index 65beeb8dfeb27..19b3c29a2c485 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc +++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc @@ -403,7 +403,7 @@ std::unordered_set GetTuplePushContainer(pir::Block* block) { return inner_outputs; } -void InsertTuplePushContinerToOuts( +void InsertTuplePushContainerToOuts( pir::Block* block, const ValueExecutionInfo& value_exec_info, std::unordered_map>* outputs) { @@ -412,7 +412,7 @@ void InsertTuplePushContinerToOuts( for (pir::Value value : inner_stack_outputs) { outputs->emplace(value, GetValueIds(value, value_exec_info)); - VLOG(6) << "InsertTuplePushContinerToOuts of " << value.impl(); + VLOG(6) << "InsertTuplePushContainerToOuts of " << value.impl(); } } diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.h b/paddle/fluid/framework/new_executor/instruction/instruction_util.h index 787c1099044a8..2887d3c4aca2f 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_util.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.h @@ -50,7 +50,7 @@ std::vector GetExternalInputs( const ValueExecutionInfo& value_exec_info, std::unordered_map>* input_ids); -void InsertTuplePushContinerToOuts( +void InsertTuplePushContainerToOuts( pir::Block* block, const ValueExecutionInfo& value_exec_info, std::unordered_map>* outputs); diff --git a/paddle/phi/kernels/funcs/fft_key.h b/paddle/phi/kernels/funcs/fft_key.h index 8a577754cf051..d0e6f603bd377 100644 --- a/paddle/phi/kernels/funcs/fft_key.h +++ b/paddle/phi/kernels/funcs/fft_key.h @@ -58,7 +58,7 @@ struct FFTConfigKey { template struct KeyHash { // Key must be a POD because we read out its memory - // contenst as char* when hashing + // contents as char* when hashing static_assert(std::is_pod::value, "Key must be plain old data type"); size_t operator()(const Key& params) const { @@ -75,7 +75,7 @@ struct KeyHash { template struct KeyEqual { // Key must be a POD because we read out its memory - // contenst as char* when comparing + // contents as char* when comparing static_assert(std::is_pod::value, "Key must be plain old data type"); bool operator()(const Key& a, const Key& b) const { diff --git 
a/python/paddle/distributed/passes/ps_trainer_pass.py b/python/paddle/distributed/passes/ps_trainer_pass.py index 522bf6daa4bc4..84860d0bc3807 100755 --- a/python/paddle/distributed/passes/ps_trainer_pass.py +++ b/python/paddle/distributed/passes/ps_trainer_pass.py @@ -1014,7 +1014,7 @@ def _create_heter_program( block_var_detail[stage_id - 1]["backward"]["persistables"], ) - # add step conter + # add step counter send_input_vars = [] dummy_output = [] pserver_endpoints = get_ps_endpoints(role_maker) From cf48a28934442b8fd4c29e03c7dbf468503ed109 Mon Sep 17 00:00:00 2001 From: rich04lin <152049331+rich04lin@users.noreply.github.com> Date: Sun, 15 Dec 2024 17:18:34 +0800 Subject: [PATCH 7/9] c53-58 --- _typos.toml | 14 +++++++------- paddle/fluid/operators/print_op.cc | 2 +- paddle/phi/backends/onednn/onednn_reuse.h | 2 +- paddle/phi/kernels/funcs/weight_only_gemv.cu | 10 +++++----- python/paddle/jit/dy2static/program_translator.py | 2 +- test/cpp/inference/api/analyzer_capi_ner_tester.cc | 2 +- test/cpp/inference/infer_ut/README.md | 2 +- test/dygraph_to_static/test_logging_utils.py | 2 +- test/legacy_test/test_lbfgs_class.py | 8 ++++---- tools/gen_pybind11_stub.py | 2 +- 10 files changed, 23 insertions(+), 23 deletions(-) diff --git a/_typos.toml b/_typos.toml index d5d8360c0e6fe..4a1ffac3d0622 100644 --- a/_typos.toml +++ b/_typos.toml @@ -41,13 +41,13 @@ caculate = 'caculate' calcualtion = 'calcualtion' checkings = 'checkings' childs = 'childs' -Continous = 'Continous' -contibute = 'contibute' -controled = 'controled' -contorl = 'contorl' -converage = 'converage' -Converage = 'Converage' -convertion = 'convertion' +#Continous = 'Continous' +#contibute = 'contibute' +#controled = 'controled' +#contorl = 'contorl' +#converage = 'converage' +#Converage = 'Converage' +#convertion = 'convertion' Conver = 'Conver' convience = 'convience' coodinate = 'coodinate' diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc index 5c7ed0afb5af2..8bb819efbd137 100644 --- a/paddle/fluid/operators/print_op.cc +++ b/paddle/fluid/operators/print_op.cc @@ -186,6 +186,6 @@ REGISTER_OPERATOR(print, REGISTER_OP_VERSION(print).AddCheckpoint( R"ROC(Upgrade print add a new attribute [print_tensor_layout] to " - "contorl whether to print tensor's layout.)ROC", + "control whether to print tensor's layout.)ROC", paddle::framework::compatible::OpVersionDesc().NewAttr( "print_tensor_layout", "Whether to print the tensor's layout.", true)); diff --git a/paddle/phi/backends/onednn/onednn_reuse.h b/paddle/phi/backends/onednn/onednn_reuse.h index b06990d51c48a..952c2a73b1376 100644 --- a/paddle/phi/backends/onednn/onednn_reuse.h +++ b/paddle/phi/backends/onednn/onednn_reuse.h @@ -1166,7 +1166,7 @@ class BinaryOneDNNHandler : public OneDNNHandlerNoCachingT { float scale_y, float scale_out, dnnl::post_ops post_ops = dnnl::post_ops{}) { - // Scales set in attributes for inputs contibute to the output equation + // Scales set in attributes for inputs contribute to the output equation // in the following way (assuming no broadcasting takes place): // output_i = scale_0 * x_i <+ or *> scale_1 * y_i; // Hence we have to create scales that will: diff --git a/paddle/phi/kernels/funcs/weight_only_gemv.cu b/paddle/phi/kernels/funcs/weight_only_gemv.cu index f6944a4027260..9ee08f654b65f 100644 --- a/paddle/phi/kernels/funcs/weight_only_gemv.cu +++ b/paddle/phi/kernels/funcs/weight_only_gemv.cu @@ -394,7 +394,7 @@ struct WeightLayoutDetails { // 20 21 28 29 6 7 14 15 22 23 30 31 static constexpr int 
kShuffleSize = 32; static constexpr int kShuffleBasicTile = 2; - static constexpr int kShuffleContinous = 4; + static constexpr int kShuffleContinuous = 4; static constexpr int kShuffleStrided = 4; // The rearrangement here counteracts the effect of @@ -456,7 +456,7 @@ struct WeightLayoutDetails { // 13 14 15 weight 0 1 8 9 2 3 10 11 4 5 12 13 6 7 14 15 static constexpr int kShuffleSize = 16; static constexpr int kShuffleBasicTile = 2; - static constexpr int kShuffleContinous = 2; + static constexpr int kShuffleContinuous = 2; static constexpr int kShuffleStrided = 4; // The rearrangement here counteracts the effect of @@ -504,7 +504,7 @@ struct WeightOnlyKernelDetails { static constexpr int kShuffleSize = Layout::kShuffleSize; static constexpr int kShuffleBasicTile = Layout::kShuffleBasicTile; - static constexpr int kShuffleContinous = Layout::kShuffleContinous; + static constexpr int kShuffleContinuous = Layout::kShuffleContinuous; static constexpr int kShuffleStrided = Layout::kShuffleStrided; // using Converter = typename Layout::Converter; @@ -848,14 +848,14 @@ struct WeightPostProcessor { int idx) { using HALF_2_TYPE = typename CUDA_HALF_2_TYPE_TARIS::type; #pragma unroll - for (int i = 0; i < Details::kShuffleContinous; ++i) { + for (int i = 0; i < Details::kShuffleContinuous; ++i) { #pragma unroll for (int j = 0; j < Details::kShuffleStrided; ++j) { // Dequantize the weights and arrange the shuffled elements back to // the correct order in the register array HALF_2_TYPE v = *reinterpret_cast( weights_vec + i * Details::kShuffleBasicTile + - j * Details::kShuffleContinous * Details::kShuffleBasicTile); + j * Details::kShuffleContinuous * Details::kShuffleBasicTile); v = HalfMulAdd::apply( v, ConvertDstFunc_2::apply(scale[idx]), diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py index 651c91e048959..dc0c288c27957 100644 --- a/python/paddle/jit/dy2static/program_translator.py +++ b/python/paddle/jit/dy2static/program_translator.py @@ -1034,7 +1034,7 @@ def concrete_program_specify_input_spec( # if specific the `input_spec`, the length of program_cache will always 1, # else, return the last one. cached_program_len = len(self._program_cache) - # If specific `input_spec`, apply convertion from dygraph layers into static Program. + # If specific `input_spec`, apply conversion from dygraph layers into static Program. 
# NOTE(jiabin): is_prim_infer indicates this method called by paddle.jit.save and it is worked in prim mode desired_input_spec = input_spec diff --git a/test/cpp/inference/api/analyzer_capi_ner_tester.cc b/test/cpp/inference/api/analyzer_capi_ner_tester.cc index 561fcb592de25..fbd3b2725dbd9 100644 --- a/test/cpp/inference/api/analyzer_capi_ner_tester.cc +++ b/test/cpp/inference/api/analyzer_capi_ner_tester.cc @@ -111,7 +111,7 @@ TEST(PD_ZeroCopyRun, zero_copy_run) { "%s", PD_GetOutputName(predictor, 0)); - // not necessary, just for converage tests + // not necessary, just for coverage tests output.lod.data = std::malloc(sizeof(size_t)); PD_GetZeroCopyOutput(predictor, &output); diff --git a/test/cpp/inference/infer_ut/README.md b/test/cpp/inference/infer_ut/README.md index 94e2665d7759d..82f5bc7704c49 100644 --- a/test/cpp/inference/infer_ut/README.md +++ b/test/cpp/inference/infer_ut/README.md @@ -24,7 +24,7 @@ busybox bash ./run.sh $PADDLE_ROOT $TURN_ON_MKL $TEST_GPU_CPU $DATA_DIR - `$TEST_GPU_CPU`: test both GPU/CPU mode or only CPU mode - `$DATA_DIR`: download data path -now only support 4 kinds of tests which controled by `--gtest_filter` argument, test suite name should be same as following. +now only support 4 kinds of tests which controlled by `--gtest_filter` argument, test suite name should be same as following. - `TEST(gpu_tester_*, test_name)` - `TEST(cpu_tester_*, test_name)` - `TEST(mkldnn_tester_*, test_name)` diff --git a/test/dygraph_to_static/test_logging_utils.py b/test/dygraph_to_static/test_logging_utils.py index fa34869e92205..e03880a1af722 100644 --- a/test/dygraph_to_static/test_logging_utils.py +++ b/test/dygraph_to_static/test_logging_utils.py @@ -87,7 +87,7 @@ def test_set_code_level(self): paddle.jit.set_code_level(3.3) def test_log_api(self): - # test api for CI Converage + # test api for CI Coverage logging_utils.set_verbosity(1, True) logging_utils.warn("warn") diff --git a/test/legacy_test/test_lbfgs_class.py b/test/legacy_test/test_lbfgs_class.py index 631d21962e398..17b2e88587cc0 100644 --- a/test/legacy_test/test_lbfgs_class.py +++ b/test/legacy_test/test_lbfgs_class.py @@ -89,7 +89,7 @@ def func(w, x): np.testing.assert_allclose(net.w, weight, rtol=1e-05) def test_inf_minima_incubate(self): - # not converage + # not converge input = np.random.rand(1).astype(np.float32) def outputs1(x): @@ -169,7 +169,7 @@ def error_func1(): self.assertRaises(TypeError, error_func1) def test_error2_incubate(self): - # not converage + # not converge input = np.random.rand(1).astype(np.float32) def outputs2(x): @@ -339,7 +339,7 @@ def func(w, x): np.testing.assert_allclose(net.w, weight, rtol=1e-05) def test_inf_minima(self): - # not converage + # not converge input = np.random.rand(1).astype(np.float32) def outputs1(x): @@ -419,7 +419,7 @@ def error_func1(): self.assertRaises(TypeError, error_func1) def test_error2(self): - # not converage + # not converge input = np.random.rand(1).astype(np.float32) def outputs2(x): diff --git a/tools/gen_pybind11_stub.py b/tools/gen_pybind11_stub.py index 116cdcbdee1cb..3e8dfe839f14c 100644 --- a/tools/gen_pybind11_stub.py +++ b/tools/gen_pybind11_stub.py @@ -71,7 +71,7 @@ # ref: # - https://pybind11.readthedocs.io/en/latest/advanced/misc.html#avoiding-cpp-types-in-docstrings # - https://pybind11.readthedocs.io/en/latest/advanced/functions.html#default-arguments-revisited -# we can add some mappings for convertion, e.g. {'paddle::Tensor': 'paddle.Tensor'} +# we can add some mappings for conversion, e.g. 
{'paddle::Tensor': 'paddle.Tensor'} PYBIND11_ATTR_MAPPING = {} # some bad full expression pybind11-stubgen can not catch as invalid exp From 1ae4b0b7dcd988e93716a4eb1de2b56c6f7f9586 Mon Sep 17 00:00:00 2001 From: rich04lin <152049331+rich04lin@users.noreply.github.com> Date: Sun, 15 Dec 2024 17:28:04 +0800 Subject: [PATCH 8/9] c53-58 --- _typos.toml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/_typos.toml b/_typos.toml index 4a1ffac3d0622..e40d20c3ca14b 100644 --- a/_typos.toml +++ b/_typos.toml @@ -41,13 +41,6 @@ caculate = 'caculate' calcualtion = 'calcualtion' checkings = 'checkings' childs = 'childs' -#Continous = 'Continous' -#contibute = 'contibute' -#controled = 'controled' -#contorl = 'contorl' -#converage = 'converage' -#Converage = 'Converage' -#convertion = 'convertion' Conver = 'Conver' convience = 'convience' coodinate = 'coodinate' From d63e24de9cba9b7d769bfcd664d87d96efbfeef2 Mon Sep 17 00:00:00 2001 From: rich04lin <152049331+rich04lin@users.noreply.github.com> Date: Mon, 16 Dec 2024 23:40:17 +0800 Subject: [PATCH 9/9] c59 --- _typos.toml | 1 - .../decomp_rule/decomp_rule/composite.h | 90 ++++++------- .../decomp_rule/decomp_vjp/details.h | 118 +++++++++--------- .../primitive/decomp_utils/decomp_utils.h | 4 +- 4 files changed, 106 insertions(+), 107 deletions(-) diff --git a/_typos.toml b/_typos.toml index e40d20c3ca14b..7e0f61f5f6dfc 100644 --- a/_typos.toml +++ b/_typos.toml @@ -41,7 +41,6 @@ caculate = 'caculate' calcualtion = 'calcualtion' checkings = 'checkings' childs = 'childs' -Conver = 'Conver' convience = 'convience' coodinate = 'coodinate' copyed = 'copyed' diff --git a/paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h b/paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h index 9e6aef48307d2..58d630f7caa78 100644 --- a/paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h +++ b/paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h @@ -36,7 +36,7 @@ Tensor any_decomp(const Tensor& x, const IntArray& axis, bool keepdim) { template Tensor mean_decomp(const Tensor& x, const IntArray& axis, bool keepdim) { - auto x_tmp = ConverToMT(x); + auto x_tmp = ConvertToMT(x); std::vector x_dim = x_tmp.shape(); int64_t axis_size = axis.size(); @@ -82,7 +82,7 @@ Tensor mean_decomp(const Tensor& x, const IntArray& axis, bool keepdim) { Tensor res = sum_x / value; - return ConverToOrig(res, x.dtype()); + return ConvertToOrig(res, x.dtype()); } static void check_valid_type(const DataType& dtype) { @@ -112,7 +112,7 @@ Tensor p_norm_decomp(const Tensor& x, const float epsilon = 1.0e-12f, const bool& keepdim = false, const bool& asvector = false) { - auto x_tmp = ConverToMT(x); + auto x_tmp = ConvertToMT(x); Tensor res; if (porder == 0.0) { @@ -146,17 +146,17 @@ Tensor p_norm_decomp(const Tensor& x, res = elementwise_pow(res, inv_porder_tensor); } - return ConverToOrig(res, x.dtype()); + return ConvertToOrig(res, x.dtype()); } template Tensor pow_decomp(const Tensor& x, const paddle::Scalar& y) { - auto x_cast = ConverToMT(x); + auto x_cast = ConvertToMT(x); check_valid_type(y.dtype()); Tensor y_full = full_scalar(y, x_cast.dtype(), x_cast.place()); auto ans = elementwise_pow(x_cast, y_full); - return ConverToOrig(ans, x.dtype()); + return ConvertToOrig(ans, x.dtype()); } template @@ -263,7 +263,7 @@ std::tuple batch_norm_decomp( bool use_global_stats, bool trainable_statistics) { auto org_dtype = x.dtype(); - Tensor x_cast = ConverToMT(x); + Tensor x_cast = ConvertToMT(x); BatchNormDecompHelper decomp_help(x, scale, bias, data_layout); @@ -319,7 
+319,7 @@ std::tuple batch_norm_decomp( : bias.get()); } - y = ConverToOrig(y, org_dtype); + y = ConvertToOrig(y, org_dtype); if (!use_run_stat) { batch_mean_ = squeeze(batch_mean, reduce_axes); @@ -336,25 +336,25 @@ std::tuple batch_norm_decomp( template Tensor softmax_decomp(const Tensor& x, const int& axis) { - auto x_tmp = ConverToMT(x); + auto x_tmp = ConvertToMT(x); auto max_tmp = max(x_tmp, {axis}, true); auto molecular = exp(x_tmp - max_tmp); auto res = molecular / sum(molecular, {axis}, molecular.dtype(), true); - return ConverToOrig(res, x.dtype()); + return ConvertToOrig(res, x.dtype()); } template Tensor log_softmax_decomp(const Tensor& x, const int& axis) { - auto x_tmp = ConverToMT(x); + auto x_tmp = ConvertToMT(x); auto max_tmp = max(x_tmp, {axis}, true); auto sub = x_tmp - max_tmp; auto molecular = exp(sub); auto res = sub - log(sum(molecular, {axis}, molecular.dtype(), true)); - return ConverToOrig(res, x.dtype()); + return ConvertToOrig(res, x.dtype()); } template @@ -411,9 +411,9 @@ Tensor stack_decomp(const std::vector& x, const int& axis) { template Tensor silu_decomp(const Tensor& x) { - auto x_tmp = ConverToMT(x); + auto x_tmp = ConvertToMT(x); auto res = x_tmp * sigmoid(x_tmp); - return ConverToOrig(res, x.dtype()); + return ConvertToOrig(res, x.dtype()); } template @@ -541,7 +541,7 @@ std::tuple layer_norm_decomp( int begin_norm_axis) { std::vector reduce_axis; auto org_dtype = x.dtype(); - Tensor x_cast = ConverToMT(x); + Tensor x_cast = ConvertToMT(x); auto x_dims = x.dims(); @@ -562,13 +562,13 @@ std::tuple layer_norm_decomp( Tensor scale_cast; if (scale) { scale_cast = decomp_helper.Process(scale.get(), x_cast); - scale_cast = ConverToMT(scale_cast); + scale_cast = ConvertToMT(scale_cast); out = out * scale_cast; } Tensor bias_cast; if (bias) { bias_cast = decomp_helper.Process(bias.get(), x_cast); - bias_cast = ConverToMT(bias_cast); + bias_cast = ConvertToMT(bias_cast); out = out + bias_cast; } mean_ = squeeze(mean_, reduce_axis); @@ -577,7 +577,7 @@ std::tuple layer_norm_decomp( // same as LayerNormInferMeta // x: float32 --> out: float32, mean: float32, variance: float32 // x: float16 --> out: float16, mean: float32, variance: float32 - out = ConverToOrig(out, org_dtype); + out = ConvertToOrig(out, org_dtype); return std::make_tuple(out, mean_, variance); } @@ -751,7 +751,7 @@ std::tuple instance_norm_decomp( const paddle::optional& bias, float epsilon) { auto org_dtype = x.dtype(); - Tensor x_cast = ConverToMT(x); + Tensor x_cast = ConvertToMT(x); const std::vector x_dims = x.shape(); if (has_dynamic_shape(x_dims)) { @@ -790,20 +790,20 @@ std::tuple instance_norm_decomp( if (scale) { auto scale_cast = backend::reshape(scale.get(), slice_shape_tensor); - scale_cast = ConverToMT(scale_cast); + scale_cast = ConvertToMT(scale_cast); out = out * scale_cast; } if (bias) { auto bias_cast = backend::reshape(bias.get(), slice_shape_tensor); - bias_cast = ConverToMT(bias_cast); + bias_cast = ConvertToMT(bias_cast); out = out + bias_cast; } std::vector res_shape(1, -1); auto mean_out = reshape(mean_, res_shape); auto variance_out = reshape(rsqrt_var, res_shape); - auto res = ConverToOrig(out, org_dtype); + auto res = ConvertToOrig(out, org_dtype); return std::make_tuple(res, mean_out, variance_out); } @@ -830,20 +830,20 @@ std::tuple instance_norm_decomp( out = reshape(out, x_dims); if (scale) { auto scale_cast = reshape(scale.get(), slice_shape); - scale_cast = ConverToMT(scale_cast); + scale_cast = ConvertToMT(scale_cast); out = out * scale_cast; } if (bias) { auto 
bias_cast = reshape(bias.get(), slice_shape); - bias_cast = ConverToMT(bias_cast); + bias_cast = ConvertToMT(bias_cast); out = out + bias_cast; } std::vector res_shape(1, -1); auto mean_out = reshape(mean_, res_shape); auto variance_out = reshape(rsqrt_var, res_shape); - auto res = ConverToOrig(out, org_dtype); + auto res = ConvertToOrig(out, org_dtype); return std::make_tuple(res, mean_out, variance_out); } @@ -985,7 +985,7 @@ std::tuple group_norm_decomp( } auto org_dtype = x.dtype(); - Tensor x_cast = ConverToMT(x); + Tensor x_cast = ConvertToMT(x); Tensor x_dim_t; Tensor out, mean_, var_; @@ -1047,7 +1047,7 @@ std::tuple group_norm_decomp( } else { scale_cast = scale.get(); } - scale_cast = ConverToMT(scale_cast); + scale_cast = ConvertToMT(scale_cast); out = out * scale_cast; } Tensor bias_cast; @@ -1057,7 +1057,7 @@ std::tuple group_norm_decomp( } else { bias_cast = bias.get(); } - bias_cast = ConverToMT(bias_cast); + bias_cast = ConvertToMT(bias_cast); out = out + bias_cast; } Tensor mean_out, var_out; @@ -1072,20 +1072,20 @@ std::tuple group_norm_decomp( mean_out = reshape(mean_, res_shape); var_out = reshape(var_, res_shape); } - out = ConverToOrig(out, org_dtype); + out = ConvertToOrig(out, org_dtype); return std::make_tuple(out, mean_out, var_out); } template Tensor square_decomp(const Tensor& x) { - auto x_cast = ConverToMT(x); + auto x_cast = ConvertToMT(x); Tensor two; two = full_scalar(2, x_cast.dtype(), x_cast.place()); auto ans = elementwise_pow(x_cast, two); - return ConverToOrig(ans, x.dtype()); + return ConvertToOrig(ans, x.dtype()); } template @@ -1131,7 +1131,7 @@ Tensor sigmoid_cross_entropy_with_logits_decomp( template Tensor mean_all_decomp(const Tensor& x) { - auto x_cast = ConverToMT(x); + auto x_cast = ConvertToMT(x); auto x_shape = x.shape(); Tensor ans; @@ -1147,7 +1147,7 @@ Tensor mean_all_decomp(const Tensor& x) { ans = sum(x_cast) / x_cast.numel(); } - return ConverToOrig(ans, x.dtype()); + return ConvertToOrig(ans, x.dtype()); } template @@ -1243,7 +1243,7 @@ Tensor index_sample_decomp(const Tensor& x, const Tensor& index) { template Tensor elu_decomp(const Tensor& x, const float alpha) { - auto x_cast = ConverToMT(x); + auto x_cast = ConvertToMT(x); Tensor zero; Tensor tmp_res; @@ -1258,16 +1258,16 @@ Tensor elu_decomp(const Tensor& x, const float alpha) { tmp_res = alpha * (exp(x_cast) - 1); } auto ans = where(x_cast > zero, x_cast, tmp_res); - return ConverToOrig(ans, x.dtype()); + return ConvertToOrig(ans, x.dtype()); } template Tensor lerp_decomp(const Tensor& x, const Tensor& y, const Tensor& weight) { - Tensor x_cast = ConverToMT(x); - Tensor y_cast = ConverToMT(y); - Tensor weight_cast = ConverToMT(weight); + Tensor x_cast = ConvertToMT(x); + Tensor y_cast = ConvertToMT(y); + Tensor weight_cast = ConvertToMT(weight); Tensor res = x_cast + weight_cast * (y_cast - x_cast); - return ConverToOrig(res, x.dtype()); + return ConvertToOrig(res, x.dtype()); } template @@ -1420,9 +1420,9 @@ Tensor eye_decomp(const paddle::Scalar& num_rows, int32_t min_num = std::min(num_rows.to(), num_columns.to()); Tensor zero_tensor = full({num_rows.to(), num_columns.to()}, 0, dtype, place); - auto zero_tensor_cast = ConverToMT(zero_tensor); + auto zero_tensor_cast = ConvertToMT(zero_tensor); Tensor diag_one = unsqueeze(full({min_num}, 1, dtype, place), {1}); - auto diag_one_cast = ConverToMT(diag_one); + auto diag_one_cast = ConvertToMT(diag_one); auto start = full({1}, 0, dtype, place); auto stop = full({1}, min_num, dtype, place); @@ -1430,17 +1430,17 @@ Tensor 
   Tensor index = unsqueeze<T>(
       backend::arange<T>(start, stop, step, DataType::INT32, place), {1});
-  auto index_cast = ConverToMT<T>(index);
+  auto index_cast = ConvertToMT<T>(index);
 
   Tensor res = put_along_axis<T>(zero_tensor_cast, index, diag_one_cast, 1);
-  return ConverToOrig<T>(res, dtype);
+  return ConvertToOrig<T>(res, dtype);
 }
 
 template <typename T>
 Tensor diag_decomp(const Tensor& x,
                    const int& offset = 0,
                    const float& padding_value = 0.0) {
-  Tensor cast_x = ConverToMT<T>(x);
+  Tensor cast_x = ConvertToMT<T>(x);
   int64_t rank = cast_x.dims().size();
   Tensor res;
   if (rank == 1) {
@@ -1482,7 +1482,7 @@ Tensor diag_decomp(const Tensor& x,
         backend::arange<T>(start, end, stride, DataType::INT64, cast_x.place());
     res = take_along_axis<T>(x_flat, indices, 0);
   }
-  return ConverToOrig<T>(res, x.dtype());
+  return ConvertToOrig<T>(res, x.dtype());
 }
 
 }  // namespace details
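Note: every decomp rule in the file above follows the same promote-compute-restore contract: ConvertToMT<T> widens fp16/bf16 storage to fp32, the arithmetic runs at the wider precision, and ConvertToOrig<T> casts the result back to the original dtype. A minimal self-contained sketch of that pattern, using softmax as in softmax_decomp above — plain C++, with float/double standing in for fp16/fp32, and softmax_mt a hypothetical name, not Paddle's API:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Promote-compute-restore: compute in a wider "MT" type (double standing
    // in for fp32), store in a narrower one (float standing in for fp16/bf16).
    std::vector<float> softmax_mt(const std::vector<float>& x) {
      if (x.empty()) return {};
      std::vector<double> mt(x.begin(), x.end());  // ConvertToMT's role
      const double mx = *std::max_element(mt.begin(), mt.end());
      double sum = 0.0;
      for (double& v : mt) {
        v = std::exp(v - mx);  // max subtraction keeps exp() from overflowing
        sum += v;
      }
      std::vector<float> out(x.size());
      for (std::size_t i = 0; i < x.size(); ++i) {
        out[i] = static_cast<float>(mt[i] / sum);  // ConvertToOrig's role
      }
      return out;
    }

The max subtraction here mirrors softmax_decomp: exp(v - max) pins the largest exponent at 0 so the sum cannot overflow, which is exactly why the rule promotes to fp32 before exponentiating.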
diff --git a/paddle/fluid/primitive/decomp_rule/decomp_vjp/details.h b/paddle/fluid/primitive/decomp_rule/decomp_vjp/details.h
index b188c517c20a4..61c27a54361b5 100644
--- a/paddle/fluid/primitive/decomp_rule/decomp_vjp/details.h
+++ b/paddle/fluid/primitive/decomp_rule/decomp_vjp/details.h
@@ -52,12 +52,12 @@ void bce_loss_grad(const Tensor& input,
                    Tensor* input_grad) {
   using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   if (input_grad) {
-    auto input_mt = ConverToMT<T>(input);
+    auto input_mt = ConvertToMT<T>(input);
     auto term = maximum<T>((1 - input_mt) * input_mt,
                            full_scalar<T>(1e-12, input_mt.dtype()));
     auto out_base =
-        ConverToMT<T>(out_grad) * (input_mt - ConverToMT<T>(label)) / term;
-    set_output<T>(ConverToOrig<T>(out_base, input.dtype()), input_grad);
+        ConvertToMT<T>(out_grad) * (input_mt - ConvertToMT<T>(label)) / term;
+    set_output<T>(ConvertToOrig<T>(out_base, input.dtype()), input_grad);
   }
 }
@@ -324,8 +324,8 @@ void gelu_grad(const Tensor& x,
 
   // Automatically promote to fp32 when the input type is fp16 for keeping
   // consistent with phi kernel
-  auto promoted_x = ConverToMT<T>(x);
-  auto promoted_out_grad = ConverToMT<T>(out_grad);
+  auto promoted_x = ConvertToMT<T>(x);
+  auto promoted_out_grad = ConvertToMT<T>(out_grad);
   if (approximate) {
     float kbeta = M_SQRT2 * M_2_SQRTPI * 0.5;
     float kkappa = 0.044715;
@@ -347,7 +347,7 @@ void gelu_grad(const Tensor& x,
     auto right_derivative = left * tanh_derivative * inner_derivative;
 
     set_output<T>(
-        ConverToOrig<T>(
+        ConvertToOrig<T>(
            promoted_out_grad * (left_derivative + right_derivative), x.type()),
        x_grad);
   } else {
@@ -358,9 +358,9 @@ void gelu_grad(const Tensor& x,
     auto cdf = scale<T>(scale<T>(erf<T>(kalpha_ * promoted_x), 1., 1.), 0.5);
     auto pdf = kbeta_ * exp<T>(scale<T>(promoted_x * promoted_x, -0.5));
-    set_output<T>(
-        ConverToOrig<T>(promoted_out_grad * (cdf + promoted_x * pdf), x.type()),
-        x_grad);
+    set_output<T>(ConvertToOrig<T>(promoted_out_grad * (cdf + promoted_x * pdf),
+                                   x.type()),
+                  x_grad);
   }
 }
@@ -849,7 +849,7 @@ void layer_norm_grad(const Tensor& x,
   auto mean_ = reshape<T>(mean, mean_var_new_shape);
   auto variance_ = reshape<T>(variance, mean_var_new_shape);
 
-  auto x_cast = ConverToMT<T>(x);
+  auto x_cast = ConvertToMT<T>(x);
   Tensor scale_cast;
   if (scale_ptr) {
     scale_cast = decomp_help.Process(*scale_ptr, x_cast);
@@ -857,9 +857,9 @@ void layer_norm_grad(const Tensor& x,
 
   // cast dtype to float32 if dtype =float16 or bfloat16
-  auto out_grad_cast = ConverToMT<T>(out_grad);
+  auto out_grad_cast = ConvertToMT<T>(out_grad);
   if (scale_ptr) {
-    scale_cast = ConverToMT<T>(scale_cast);
+    scale_cast = ConvertToMT<T>(scale_cast);
   }
 
   auto x_sub_mean = x_cast - mean_;  // M,N
@@ -885,7 +885,7 @@ void layer_norm_grad(const Tensor& x,
         (d_mean + d_std) / decomp_help.GetNormlizedNumel(d_std);
     auto x_grad_tmp = dx_end - d_mean_d_std;
-    x_grad_tmp = ConverToOrig<T>(x_grad_tmp, x.dtype());
+    x_grad_tmp = ConvertToOrig<T>(x_grad_tmp, x.dtype());
     set_output<T>(x_grad_tmp, x_grad);
   }
@@ -895,7 +895,7 @@ void layer_norm_grad(const Tensor& x,
       auto scale_grad_tmp = (x_sub_mean_mul_sqrt_var_1 * out_grad_cast)
                                 .sum(un_normlized_axis, x_cast.dtype(), true);
       scale_grad_tmp = reshape<T>(scale_grad_tmp, {-1});
-      scale_grad_tmp = ConverToOrig<T>(scale_grad_tmp, scale_ptr->dtype());
+      scale_grad_tmp = ConvertToOrig<T>(scale_grad_tmp, scale_ptr->dtype());
 
       set_output<T>(scale_grad_tmp, scale_grad);
     } else {
@@ -908,7 +908,7 @@ void layer_norm_grad(const Tensor& x,
       auto bias_grad_tmp =
           out_grad_cast.sum(un_normlized_axis, x_cast.dtype(), true);
       bias_grad_tmp = reshape<T>(bias_grad_tmp, {-1});
-      bias_grad_tmp = ConverToOrig<T>(bias_grad_tmp, bias_ptr->dtype());
+      bias_grad_tmp = ConvertToOrig<T>(bias_grad_tmp, bias_ptr->dtype());
 
       set_output<T>(bias_grad_tmp, bias_grad);
     } else {
@@ -1007,11 +1007,11 @@ void square_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
 template <typename T>
 void exp_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad) {
   if (x_grad) {
-    Tensor out_promote = ConverToMT<T>(out);
-    Tensor out_grad_promote = ConverToMT<T>(out_grad);
+    Tensor out_promote = ConvertToMT<T>(out);
+    Tensor out_grad_promote = ConvertToMT<T>(out_grad);
     auto x_grad_tmp = out_promote * out_grad_promote;
 
-    set_output<T>(ConverToOrig<T>(x_grad_tmp, out.dtype()), x_grad);
+    set_output<T>(ConvertToOrig<T>(x_grad_tmp, out.dtype()), x_grad);
   }
 }
@@ -1043,11 +1043,11 @@ void silu_grad(const Tensor& x,
   if (x_grad) {
     auto one = full_scalar<T>(1.0, x.dtype());
-    auto x_cast = ConverToMT<T>(x);
-    auto out_cast = ConverToMT<T>(out);
-    auto out_grad_cast = ConverToMT<T>(out_grad);
+    auto x_cast = ConvertToMT<T>(x);
+    auto out_cast = ConvertToMT<T>(out);
+    auto out_grad_cast = ConvertToMT<T>(out_grad);
     auto res = out_grad_cast * sigmoid<T>(x_cast) * (one + x_cast - out_cast);
-    set_output<T>(ConverToOrig<T>(res, x.dtype()), x_grad);
+    set_output<T>(ConvertToOrig<T>(res, x.dtype()), x_grad);
   }
 }
@@ -1240,8 +1240,8 @@ void masked_select_grad(const Tensor& x,
                         const Tensor& out_grad,
                         Tensor* x_grad) {
   if (x_grad) {
-    auto promoted_x = ConverToMT<T>(x);
-    auto promoted_out_grad = ConverToMT<T>(out_grad);
+    auto promoted_x = ConvertToMT<T>(x);
+    auto promoted_out_grad = ConvertToMT<T>(out_grad);
 
     auto x_num = 1;
     for (size_t i = 0; i < promoted_x.shape().size(); i++) {
@@ -1406,14 +1406,14 @@ void instance_norm_grad(const Tensor& x,
   std::vector<int64_t> n_reduce_axes = decomp_helper.GetNPlusReduceAxis();
   Tensor hw = decomp_helper.GetHW(x);
 
-  auto promoted_y_grad = ConverToMT<T>(y_grad);
+  auto promoted_y_grad = ConvertToMT<T>(y_grad);
 
   Tensor x_hat;
   Tensor std_inv;
   if (scale_grad || x_grad) {
-    auto promoted_x = ConverToMT<T>(x);
-    auto promoted_saved_mean = ConverToMT<T>(saved_mean);
-    auto promoted_saved_var = ConverToMT<T>(saved_variance);
+    auto promoted_x = ConvertToMT<T>(x);
+    auto promoted_saved_mean = ConvertToMT<T>(saved_mean);
+    auto promoted_saved_var = ConvertToMT<T>(saved_variance);
 
     std::vector<int64_t> mean_new_shape{n, c};
     for (size_t i = 0; i < reduce_axes.size(); ++i) {
@@ -1433,7 +1433,7 @@ void instance_norm_grad(const Tensor& x,
                             : full<T>(IntArray({c}), 1., x.dtype(), x.place());
     auto unsqueeze_shape = get_unsqueeze_dims(scale_data_tensor, n_reduce_axes);
     auto scale_data = reshape<T>(scale_data_tensor, unsqueeze_shape);
-    auto promoted_scale = ConverToMT<T>(scale_data);
+    auto promoted_scale = ConvertToMT<T>(scale_data);
 
     auto tmp1 = is_reduce_empty ? promoted_y_grad
@@ -1444,19 +1444,19 @@ void instance_norm_grad(const Tensor& x,
                         .sum(reduce_axes, promoted_y_grad.dtype(), true);
     auto result = (promoted_scale * std_inv) *
                   (promoted_y_grad - tmp1 / hw - (x_hat * tmp2 / hw));
-    set_output<T>(ConverToOrig<T>(result, x.dtype()), x_grad);
+    set_output<T>(ConvertToOrig<T>(result, x.dtype()), x_grad);
   }
   // scale_grad = x_hat * y_grad.sum(n, h, w)
   if (scale_grad) {
     auto result = (promoted_y_grad * x_hat).sum(n_reduce_axes);
     auto scale_dtype = scale.get_ptr() ? scale.get().dtype() : x.dtype();
-    set_output<T>(ConverToOrig<T>(result, scale_dtype), scale_grad);
+    set_output<T>(ConvertToOrig<T>(result, scale_dtype), scale_grad);
   }
   // d_bias = y_grad.sum(n, h, w)
   if (bias_grad) {
     auto result = promoted_y_grad.sum(n_reduce_axes);
     auto scale_dtype = scale.get_ptr() ? scale.get().dtype() : x.dtype();
-    set_output<T>(ConverToOrig<T>(result, scale_dtype), bias_grad);
+    set_output<T>(ConvertToOrig<T>(result, scale_dtype), bias_grad);
   }
 }
@@ -1938,8 +1938,8 @@ void batch_norm_grad(const Tensor& x,
                      Tensor* bias_grad) {
   use_global_stats = is_test || use_global_stats;
 
-  Tensor x_data = ConverToMT<T>(x);
-  Tensor out_grad_data = ConverToMT<T>(out_grad);
+  Tensor x_data = ConvertToMT<T>(x);
+  Tensor out_grad_data = ConvertToMT<T>(out_grad);
 
   Tensor mean_data;
   Tensor rsqrt_var;
@@ -1975,7 +1975,7 @@ void batch_norm_grad(const Tensor& x,
           x_grad_data =
               reshape<T>(scale.get(), scale_bias_new_shape) * x_grad_data;
         }
-        x_grad_data = ConverToOrig<T>(x_grad_data, x.dtype());
+        x_grad_data = ConvertToOrig<T>(x_grad_data, x.dtype());
         set_output<T>(x_grad_data, x_grad);
       } else {
         auto part1 = rsqrt_var;
@@ -1990,7 +1990,7 @@ void batch_norm_grad(const Tensor& x,
             out_grad_data - mean_temp1 - (x_data - mean_data) * mean_temp2;
         auto x_grad_data = part1 * part2;
-        x_grad_data = ConverToOrig<T>(x_grad_data, x.dtype());
+        x_grad_data = ConvertToOrig<T>(x_grad_data, x.dtype());
         set_output<T>(x_grad_data, x_grad);
       }
       if (scale_grad) {
@@ -2313,8 +2313,8 @@ void group_norm_grad(const Tensor& x,
 
   int g_num = C / groups;
 
-  Tensor x_data = ConverToMT<T>(x);
-  Tensor out_grad_data = ConverToMT<T>(out_grad);
+  Tensor x_data = ConvertToMT<T>(x);
+  Tensor out_grad_data = ConvertToMT<T>(out_grad);
 
   auto shape_group = std::vector<int64_t>({N, groups, g_num});
@@ -2348,7 +2348,7 @@ void group_norm_grad(const Tensor& x,
   Tensor d2;
   Tensor p1;
   if (scale) {
-    scale_data = ConverToMT<T>(scale_data);
+    scale_data = ConvertToMT<T>(scale_data);
     d1 = (reshape<T>(sum_y_grad_mul_x * scale_data, shape_group))
              .sum(std::vector<int64_t>({2}), dtype, false);
@@ -2383,7 +2383,7 @@ void group_norm_grad(const Tensor& x,
     auto tmp_2 = reshape<T>(x_data, whole_group_shape) * p2 + p3;
     auto x_grad_data = tmp_1 + tmp_2;
     x_grad_data = reshape<T>(x_grad_data, x.shape());
-    x_grad_data = ConverToOrig<T>(x_grad_data, x.dtype());
+    x_grad_data = ConvertToOrig<T>(x_grad_data, x.dtype());
     set_output<T>(x_grad_data, x_grad);
   }
@@ -2782,9 +2782,9 @@ void logcumsumexp_grad(const Tensor& x,
   if (x_grad) {
     reverse = !reverse;
     Tensor tmp, lowest, x_grad_tmp;
-    Tensor x_cast = ConverToMT<T>(x);
-    Tensor out_cast = ConverToMT<T>(out);
-    Tensor out_grad_cast = ConverToMT<T>(out_grad);
+    Tensor x_cast = ConvertToMT<T>(x);
+    Tensor out_cast = ConvertToMT<T>(out);
+    Tensor out_grad_cast = ConvertToMT<T>(out_grad);
 
     const Tensor out_grad_log = log<T>(abs<T>(out_grad_cast));
     auto out_grad_dtype = out_grad_cast.dtype();
@@ -2859,7 +2859,7 @@ void logcumsumexp_grad(const Tensor& x,
       x_grad_tmp = reshape<T>(out_grad_pos - out_grad_neg, x_cast.shape());
     }
-    set_output<T>(ConverToOrig<T>(x_grad_tmp, x.dtype()), x_grad);
+    set_output<T>(ConvertToOrig<T>(x_grad_tmp, x.dtype()), x_grad);
   }
 }
@@ -2973,8 +2973,8 @@ void kthvalue_grad(const Tensor& x,
                    bool keepdim,
                    Tensor* x_grad) {
   if (x_grad) {
-    auto x_cast = ConverToMT<T>(x);
-    auto out_grad_cast = ConverToMT<T>(out_grad);
+    auto x_cast = ConvertToMT<T>(x);
+    auto out_grad_cast = ConvertToMT<T>(out_grad);
     // put_along_axis doesn't support zero dim
     if (x.dims().size() == 0) {
       by_pass<T>(out_grad, x_grad);
@@ -3020,7 +3020,7 @@ void kthvalue_grad(const Tensor& x,
        x_grad_tmp = put_along_axis<T>(zero_tensor, indices_, out_grad_, axis);
      }
    }
-    set_output<T>(ConverToOrig<T>(x_grad_tmp, x.dtype()), x_grad);
+    set_output<T>(ConvertToOrig<T>(x_grad_tmp, x.dtype()), x_grad);
   }
 }
@@ -3033,9 +3033,9 @@ void argsort_grad(const Tensor& indices,
                   bool stable,
                   Tensor* x_grad) {
   if (x_grad) {
-    auto indices_cast = ConverToMT<T>(indices);
-    auto x_cast = ConverToMT<T>(x);
-    auto out_grad_cast = ConverToMT<T>(out_grad);
+    auto indices_cast = ConvertToMT<T>(indices);
+    auto x_cast = ConvertToMT<T>(x);
+    auto out_grad_cast = ConvertToMT<T>(out_grad);
 
     if (axis < 0) {
       axis += x_cast.dims().size();
@@ -3052,7 +3052,7 @@ void argsort_grad(const Tensor& indices,
     x_grad_tmp =
         put_along_axis<T>(zero_tensor, indices_cast, out_grad_cast, axis);
 
-    set_output<T>(ConverToOrig<T>(x_grad_tmp, x.dtype()), x_grad);
+    set_output<T>(ConvertToOrig<T>(x_grad_tmp, x.dtype()), x_grad);
   }
 }
@@ -3200,8 +3200,8 @@ void kron_grad(const Tensor& x,
   }
   if (y_grad) {
     Tensor zero = full<T>({1}, 0, DataType::INT32, y.place());
-    auto x_cast = ConverToMT<T>(x);
-    auto out_grad_cast = ConverToMT<T>(out_grad);
+    auto x_cast = ConvertToMT<T>(x);
+    auto out_grad_cast = ConvertToMT<T>(out_grad);
 
     Tensor out_grad_tmp;
     Tensor y_grad_tmp;
@@ -3279,7 +3279,7 @@ void kron_grad(const Tensor& x,
       }
     }
     y_grad_tmp = backend::reshape<T>(
-        ConverToOrig<T>(out_grad_tmp, out_grad.dtype()), shape64<T>(y));
+        ConvertToOrig<T>(out_grad_tmp, out_grad.dtype()), shape64<T>(y));
   } else {
     auto x_shape = x_cast.shape();
     auto y_shape = y.shape();
@@ -3305,7 +3305,7 @@ void kron_grad(const Tensor& x,
       tile_grad<T>(y_, out_grad_tmp, IntArray(x_dim), &y_grad_tmp);
       y_grad_tmp =
-          reshape<T>(ConverToOrig<T>(y_grad_tmp, y.dtype()), y.shape());
+          reshape<T>(ConvertToOrig<T>(y_grad_tmp, y.dtype()), y.shape());
     }
     set_output<T>(y_grad_tmp, y_grad);
   }
@@ -3318,11 +3318,11 @@ void take_along_axis_grad(const Tensor& arr,
                           int axis,
                           Tensor* arr_grad) {
   if (arr_grad) {
-    auto arr_cast = ConverToMT<T>(arr);
-    auto out_grad_cast = ConverToMT<T>(out_grad);
+    auto arr_cast = ConvertToMT<T>(arr);
+    auto out_grad_cast = ConvertToMT<T>(out_grad);
     // put_along_axis doesn't support zero dim
     if (arr_cast.dims().size() == 0) {
-      by_pass<T>(ConverToOrig<T>(out_grad_cast, out_grad.dtype()), arr_grad);
+      by_pass<T>(ConvertToOrig<T>(out_grad_cast, out_grad.dtype()), arr_grad);
       return;
     }
@@ -3343,7 +3343,7 @@ void take_along_axis_grad(const Tensor& arr,
     }
     auto arr_grad_tmp =
         put_along_axis<T>(zero_tensor, indices, out_grad_cast, axis);
-    set_output<T>(ConverToOrig<T>(arr_grad_tmp, arr.dtype()), arr_grad);
+    set_output<T>(ConvertToOrig<T>(arr_grad_tmp, arr.dtype()), arr_grad);
   }
 }
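Note: the last file in this patch renames the helpers themselves. Per the hunks below, ConvertToMT<T> casts to fp32 only when the input dtype is FLOAT16/BFLOAT16/UINT16 and otherwise returns the tensor unchanged, while ConvertToOrig<T> casts back only when the dtype actually differs. A standalone sketch of that conditional-cast shape — hypothetical Dtype/Tensor/cast stand-ins, not Paddle's types:

    enum class Dtype { kFloat16, kBFloat16, kUint16, kFloat32 };

    struct Tensor {
      Dtype dtype;
      // payload elided for the sketch
    };

    // Stands in for the primitive cast op the real helpers call.
    Tensor cast(Tensor t, Dtype to) {
      t.dtype = to;
      return t;
    }

    // ConvertToMT's shape: promote only low-precision storage dtypes.
    Tensor convert_to_mt(const Tensor& x) {
      const bool need_cast = x.dtype == Dtype::kFloat16 ||
                             x.dtype == Dtype::kBFloat16 ||
                             x.dtype == Dtype::kUint16;
      return need_cast ? cast(x, Dtype::kFloat32) : x;
    }

    // ConvertToOrig's shape: cast back only if the dtype actually changed.
    Tensor convert_to_orig(const Tensor& out, Dtype input_dtype) {
      return out.dtype != input_dtype ? cast(out, input_dtype) : out;
    }

Returning the input untouched on the fast path is the design point: callers can wrap every input and output unconditionally without paying a cast on fp32 graphs.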
diff --git a/paddle/fluid/primitive/decomp_utils/decomp_utils.h b/paddle/fluid/primitive/decomp_utils/decomp_utils.h
index 0509b2699f40c..98e017b9d79f6 100644
--- a/paddle/fluid/primitive/decomp_utils/decomp_utils.h
+++ b/paddle/fluid/primitive/decomp_utils/decomp_utils.h
@@ -284,7 +284,7 @@ static bool has_dynamic_shape(const std::vector<int64_t>& shape,
 }
 
 template <typename T>
-Tensor ConverToMT(const Tensor& x) {
+Tensor ConvertToMT(const Tensor& x) {
   bool need_cast = x.dtype() == phi::DataType::FLOAT16 ||
                    x.dtype() == phi::DataType::BFLOAT16 ||
                    x.dtype() == phi::DataType::UINT16;
@@ -295,7 +295,7 @@ Tensor ConverToMT(const Tensor& x) {
 }
 
 template <typename T>
-Tensor ConverToOrig(const Tensor& out, phi::DataType input_dtype) {
+Tensor ConvertToOrig(const Tensor& out, phi::DataType input_dtype) {
   bool need_cast = out.dtype() != input_dtype;
   if (need_cast) {
     return cast<T>(out, input_dtype);