diff --git a/csrc/cpu/src/avx_weight_only.cc b/csrc/cpu/src/avx_weight_only.cc
index 2f0ce1acf7cf..399a28abd21e 100644
--- a/csrc/cpu/src/avx_weight_only.cc
+++ b/csrc/cpu/src/avx_weight_only.cc
@@ -137,12 +137,12 @@ std::vector<paddle::Tensor> InvokeAvxWeightOnly(const paddle::Tensor &x,
 
 std::vector<std::vector<int64_t>> AvxWeightOnlyInferShape(
     std::vector<int64_t> x_shape,
-    std::vector<int64_t> weigh_shape) {
+    std::vector<int64_t> weight_shape) {
   int m = 1;
   for (int i = 0; i < x_shape.size() - 1; i++) {
     m = m * x_shape[i];
   }
-  return {std::vector<int64_t>{m, weigh_shape[1]}};
+  return {std::vector<int64_t>{m, weight_shape[1]}};
 }
 
 std::vector<paddle::DataType> AvxWeightOnlyInferDtype(
diff --git a/docs/llm/docs/llm_trainer.rst b/docs/llm/docs/llm_trainer.rst
index e69721fdee6d..c8685de337c9 100644
--- a/docs/llm/docs/llm_trainer.rst
+++ b/docs/llm/docs/llm_trainer.rst
@@ -50,8 +50,8 @@ Trainer进阶分布式能力使用介绍
 
 注：
 
-* 总卡数=sharding_parallel_dergee * tensor_parallel_dergee * pipeline_parallel_degree * data_parallel_degree
-* data_parallel_degree 不需要传入参数设置，由 总卡数/(sharding_parallel_dergee * tensor_parallel_dergee * pipeline_parallel_degree) 计算得来 
+* 总卡数=sharding_parallel_degree * tensor_parallel_degree * pipeline_parallel_degree * data_parallel_degree
+* data_parallel_degree 不需要传入参数设置，由 总卡数/(sharding_parallel_degree * tensor_parallel_degree * pipeline_parallel_degree) 计算得来 
 
 .. code-block:: bash
 
@@ -59,36 +59,36 @@ Trainer进阶分布式能力使用介绍
     python train.py
 
     # 单机(多机)多卡/数据并行
-    paddle.distruted.launch --devices "0,1,2,3,4,5,6,7" train.py
+    paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" train.py
 
     # 单机(多机)多卡/Sharding并行 
-    paddle.distruted.launch --devices "0,1,2,3,4,5,6,7" train.py -sharding "stage2"
+    paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" train.py --sharding "stage2"
 
     # 单机(多机)多卡/Sharding并行 + 数据并行 (sharding4 dp2)
-    paddle.distruted.launch --devices "0,1,2,3,4,5,6,7" train.py --sharding "stage2" --sharding_parallel_degree 4
+    paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" train.py --sharding "stage2" --sharding_parallel_degree 4
 
     # 单机(多机)多卡/ 张量并行 TP8
-    paddle.distruted.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 8
+    paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 8
 
     # 单机(多机)多卡/ 张量并行+数据并行 TP4 DP2
-    paddle.distruted.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 4
+    paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 4
 
     # 单机(多机)多卡/ 张量并行+sharding并行 TP4 Sharding2
-    paddle.distruted.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 4 \
+    paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 4 \
         --sharding "stage1"  --sharding_parallel_degree 2
 
     # 单机(多机)多卡/ 张量并行+流水线并行 TP2 PP4
-    paddle.distruted.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 2 \
+    paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 2 \
         --pipeline_parallel_degree 4
 
     # 单机(多机)多卡/ 张量并行+流水线并行+sharding并行  TP2 PP2 Sharding2
-    paddle.distruted.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 2 \
+    paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 2 \
         --pipeline_parallel_degree 2 \
         --sharding "stage1"  --sharding_parallel_degree 2
 
     # 4D 并行，需要两机
     # 单机(多机)多卡/ 张量并行+流水线并行+sharding并行  TP2 PP2 Sharding2 DP2
-    paddle.distruted.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 2 \
+    paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 2 \
         --pipeline_parallel_degree 2 \
         --sharding "stage1"  --sharding_parallel_degree 2
 
@@ -115,7 +115,7 @@ Trainer 分布式能力
 
 对于通用的分布式能力, PaddleNLP适配了数据并行data_parallel, 分布式参数sharding功能的支持。
 
-用户使用 paddle.distruted.launch --devices "0,1,2,3" train.py即可将运行的程序切换为多卡数据并行. 如果想要使用sharding功能, 减少模型显存占用, 指定参数--sharding "stage2"即可. 更多sharding功能配置见参数介绍部分.
+用户使用 paddle.distributed.launch --devices "0,1,2,3" train.py即可将运行的程序切换为多卡数据并行. 如果想要使用sharding功能, 减少模型显存占用, 指定参数--sharding "stage2"即可. 更多sharding功能配置见参数介绍部分.
 
 DP 或者sharding，这类功能无需用户修改组网, 直接多卡即可运行。目前已经支持PaddleNLP所有模型。
 
diff --git a/paddlenlp/cli/main.py b/paddlenlp/cli/main.py
index d79904c16c83..928309d70a73 100644
--- a/paddlenlp/cli/main.py
+++ b/paddlenlp/cli/main.py
@@ -20,7 +20,7 @@
 
 from paddlenlp.utils.import_utils import is_package_available
 
-# check whether the package is avaliable and give friendly description.
+# check whether the package is available and give friendly description.
 if not is_package_available("typer"):
     raise ModuleNotFoundError(
         "paddlenlp-cli tools is not installed correctly, you can use the following command"
@@ -110,7 +110,7 @@ def download(
     >>> paddlenlp download -c ./my-models -f bert \n
 
     Args:\n
-        model_name (str): pretarined model name, you can checkout all of model from source code. \n
+        model_name (str): pretrained model name, you can checkout all of model from source code. \n
         cache_dir (str, optional): the cache_dir. Defaults to "./models".
     """
     if not os.path.isabs(cache_dir):
@@ -241,7 +241,7 @@ def install(
 
 
 def main():
-    """the PaddleNLPCLI entry"""
+    """the PaddleNLP CLI entry"""
     app()
 
 
diff --git a/paddlenlp/data/causal_dataset.py b/paddlenlp/data/causal_dataset.py
index 05a4b679835a..8b6c09dd9935 100644
--- a/paddlenlp/data/causal_dataset.py
+++ b/paddlenlp/data/causal_dataset.py
@@ -91,7 +91,7 @@ def get_datasets_weights_and_num_samples(data_prefix, train_val_test_num_samples
     assert weight_sum > 0.0
     weights = [weight / weight_sum for weight in weights]
 
-    # Add 0.5% (the 1.005 factor) so in case the bleding dataset does
+    # Add 0.5% (the 1.005 factor) so in case the blending dataset does
     # not uniformly distribute the number of samples, we still have
     # samples left to feed to the network.
     # (NOTE, yujun06): This is a workaround to avoid issues with indexing in the blending dataset. Therefore, we need to add 20 samples to each dataset.
@@ -349,7 +349,7 @@ def __init__(
             paddle.distributed.barrier()
 
     def __len__(self):
-        # -1 is due to data structure used to retieve the index:
+        # -1 is due to data structure used to retrieve the index:
         #    sample i --> [sample_idx[i], sample_idx[i+1])
         return self.sample_idx.shape[0] - 1
 
@@ -475,7 +475,7 @@ def _build_index_mappings(
     # if build_indices and paddle.distributed.get_rank() == 0:
 
     print(
-        f"searching for causual dataset, build_indices={build_indices}, share_folder {share_folder}, check_rank_flag {check_rank_flag}",
+        f"searching for causal dataset, build_indices={build_indices}, share_folder {share_folder}, check_rank_flag {check_rank_flag}",
         flush=True,
     )
     if check_rank_flag:
@@ -500,7 +500,7 @@ def _build_index_mappings(
                 num_samples_per_epoch + 1
             ), "last epoch number of samples exceeded max value."
             # If we have less than 80% of the samples for the last epoch,
-            # seperate out the epoch and treat it differently.
+            # separate out the epoch and treat it differently.
             # Note: the 80% number is just based on common sense and can
             # be adjusted if needed.
             separate_last_epoch = last_epoch_num_samples < int(0.80 * num_samples_per_epoch)
@@ -530,7 +530,7 @@ def _build_index_mappings(
             doc_idx = _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch)
             np.save(idx_path["doc"], doc_idx, allow_pickle=True)
             print_rank_0(
-                " > elasped time to build and save doc-idx mapping "
+                " > elapsed time to build and save doc-idx mapping "
                 "(seconds): {:4f}".format(time.time() - start_time)
             )
             # sample-idx.
@@ -545,12 +545,12 @@ def _build_index_mappings(
             sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch)
             np.save(idx_path["sample"], sample_idx, allow_pickle=True)
             print_rank_0(
-                " > elasped time to build and save sample-idx mapping "
+                " > elapsed time to build and save sample-idx mapping "
                 "(seconds): {:4f}".format(time.time() - start_time)
             )
             # shuffle-idx.
             start_time = time.time()
-            # -1 is due to data structure used to retieve the index:
+            # -1 is due to data structure used to retrieve the index:
             #    sample i --> [sample_idx[i], sample_idx[i+1])
             if separate_last_epoch:
                 num_samples_ = num_samples_from_epochs_minus_one
@@ -559,7 +559,7 @@ def _build_index_mappings(
             shuffle_idx = _build_shuffle_idx(num_samples_, sample_idx.shape[0] - 1, np_rng)
             np.save(idx_path["shuffle"], shuffle_idx, allow_pickle=True)
             print_rank_0(
-                " > elasped time to build and save shuffle-idx mapping"
+                " > elapsed time to build and save shuffle-idx mapping"
                 " (seconds): {:4f}".format(time.time() - start_time)
             )
         except OSError:
@@ -610,7 +610,7 @@ def _num_tokens(documents, sizes):
 
 
 def _num_epochs(tokens_per_epoch, seq_length, num_samples):
-    """Based on number of samples and sequence lenght, calculate how many
+    """Based on number of samples and sequence length, calculate how many
     epochs will be needed."""
     num_epochs = 0
     total_tokens = 0
@@ -625,7 +625,7 @@ def _num_epochs(tokens_per_epoch, seq_length, num_samples):
 
 
 def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch):
-    """Build an array with length = number-of-epochs * number-of-dcuments.
+    """Build an array with length = number-of-epochs * number-of-documents.
     Each index is mapped to a corresponding document."""
     if not separate_last_epoch or num_epochs == 1:
         doc_idx = np.mgrid[0:num_epochs, 0 : len(documents)][1]
@@ -654,7 +654,7 @@ def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch):
     sample_index = 0
     # Index into doc_idx.
     doc_idx_index = 0
-    # Begining offset for each document.
+    # Beginning offset for each document.
     doc_offset = 0
     # Start with first document and no offset.
     sample_idx[sample_index][0] = doc_idx_index
diff --git a/paddlenlp/data/dist_dataloader.py b/paddlenlp/data/dist_dataloader.py
index 40f586af96b9..0f125f4b76c1 100644
--- a/paddlenlp/data/dist_dataloader.py
+++ b/paddlenlp/data/dist_dataloader.py
@@ -183,7 +183,7 @@ def _broadcast_data(self, data):
             data = nested_broadcast_tensor(data, src=self.mp_src_rank, group=self.mp_group)
         if dst_pp_group is not None:
             data = nested_broadcast_tensor(data, src=dst_pp_group.ranks[0], group=dst_pp_group)
-        # for pp1 - pp_{n-1}, Paddle need to recevie empty dict for pipeline parallel.
+        # for pp1 - pp_{n-1}, Paddle needs to receive an empty dict for pipeline parallel.
         if data is None:
             data = {}
 
diff --git a/paddlenlp/dataaug/char.py b/paddlenlp/dataaug/char.py
index dbfc3b61caee..5f2c43d1fcf6 100644
--- a/paddlenlp/dataaug/char.py
+++ b/paddlenlp/dataaug/char.py
@@ -76,7 +76,7 @@ def __init__(
         if isinstance(aug_type, str):
             self.type = aug_type
             if aug_type in ["antonym", "homonym", "custom"]:
-                self.dict = self._load_substitue_dict(aug_type)
+                self.dict = self._load_substitute_dict(aug_type)
             elif aug_type in ["mlm"]:
                 self.mlm_model = AutoModelForMaskedLM.from_pretrained(self.model_name)
                 self.mlm_tokenizer = AutoTokenizer.from_pretrained(self.model_name)
@@ -92,7 +92,7 @@ def __init__(
             # Merge dictionaries from different sources
             for t in aug_type:
                 if t in ["antonym", "homonym", "custom"]:
-                    t_dict = self._load_substitue_dict(t)
+                    t_dict = self._load_substitute_dict(t)
                     for k in t_dict:
                         if k in self.dict:
                             self.dict[k] = list(set(self.dict[k] + t_dict[k]))
@@ -101,7 +101,7 @@ def __init__(
         else:
             self.type = aug_type
 
-    def _load_substitue_dict(self, source_type):
+    def _load_substitute_dict(self, source_type):
         """Load substitution dictionary"""
         if source_type in ["antonym", "homonym"]:
             fullname = self._load_file("char_" + source_type)
@@ -112,15 +112,15 @@ def _load_substitue_dict(self, source_type):
 
         if os.path.exists(fullname):
             with open(fullname, "r", encoding="utf-8") as f:
-                substitue_dict = json.load(f)
+                substitute_dict = json.load(f)
             f.close()
         else:
             raise ValueError("The {} should exist.".format(fullname))
 
-        return substitue_dict
+        return substitute_dict
 
     def _generate_sequence(self, output_seq_tokens, aug_tokens):
-        """Genearte the sequences according to the mapping list"""
+        """Generate the sequences according to the mapping list"""
         for aug_token in aug_tokens:
             idx, token = aug_token
             output_seq_tokens[int(idx)] = token
diff --git a/paddlenlp/dataaug/word.py b/paddlenlp/dataaug/word.py
index 438935e54ac4..b4e5833af545 100644
--- a/paddlenlp/dataaug/word.py
+++ b/paddlenlp/dataaug/word.py
@@ -86,7 +86,7 @@ def __init__(
         if isinstance(aug_type, str):
             self.type = aug_type
             if aug_type in ["antonym", "embedding", "synonym", "homonym", "custom"]:
-                self.dict = self._load_substitue_dict(aug_type)
+                self.dict = self._load_substitute_dict(aug_type)
             elif aug_type in ["mlm"]:
                 self.mlm_model = AutoModelForMaskedLM.from_pretrained(self.model_name)
                 self.mlm_tokenizer = AutoTokenizer.from_pretrained(self.model_name)
@@ -102,7 +102,7 @@ def __init__(
             # Merge dictionaries from different sources
             for t in aug_type:
                 if t in ["antonym", "embedding", "synonym", "homonym", "custom"]:
-                    t_dict = self._load_substitue_dict(t)
+                    t_dict = self._load_substitute_dict(t)
                     for k in t_dict:
                         if k in self.dict:
                             self.dict[k] = list(set(self.dict[k] + t_dict[k]))
@@ -151,7 +151,7 @@ def _calculate_tfidf(self, sequence, seq_tokens, aug_indexes):
             tfidf.append(tf * idf)
         return np.array(tfidf)
 
-    def _load_substitue_dict(self, source_type):
+    def _load_substitute_dict(self, source_type):
         """Load substitution dictionary"""
         if source_type in ["antonym", "embedding", "synonym", "homonym"]:
             fullname = self._load_file("word_" + source_type)
@@ -162,12 +162,12 @@ def _load_substitue_dict(self, source_type):
 
         if os.path.exists(fullname):
             with open(fullname, "r", encoding="utf-8") as f:
-                substitue_dict = json.load(f)
+                substitute_dict = json.load(f)
             f.close()
         else:
             raise ValueError("The {} should exist.".format(fullname))
 
-        return substitue_dict
+        return substitute_dict
 
     def _generate_sequence(self, output_seq_tokens, aug_tokens):
-        """Genearte the sequences according to the mapping list"""
+        """Generate the sequences according to the mapping list"""
diff --git a/paddlenlp/transformers/ernie_vil/configuration.py b/paddlenlp/transformers/ernie_vil/configuration.py
index 41e9a43b5fba..b7d773ccaad6 100644
--- a/paddlenlp/transformers/ernie_vil/configuration.py
+++ b/paddlenlp/transformers/ernie_vil/configuration.py
@@ -176,7 +176,7 @@ class ErnieViLVisionConfig(PretrainedConfig):
         layer_norm_eps (`float`, *optional*,
             defaults to 1e-6): The epsilon used by the layer normalization layers.
         dropout (`float`, *optional*, defaults to 0.0):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         initializer_range (`float`, *optional*, defaults to 0.02):
@@ -266,7 +266,7 @@ class ErnieViLConfig(PretrainedConfig):
         vision_config (`dict`, *optional*):
             Dictionary of configuration options used to initialize [`ErnieViLVisionConfig`].
         logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
-            The inital value of the *logit_scale* paramter. Default is used as per the original ErnieViL implementation.
+            The initial value of the *logit_scale* parameter. Default is used as per the original ErnieViL implementation.
         kwargs (*optional*):
             Dictionary of keyword arguments.
 
diff --git a/paddlenlp/transformers/ernie_vil/image_processing.py b/paddlenlp/transformers/ernie_vil/image_processing.py
index 5873eb2a51c5..3b1c4b9f16bf 100644
--- a/paddlenlp/transformers/ernie_vil/image_processing.py
+++ b/paddlenlp/transformers/ernie_vil/image_processing.py
@@ -134,7 +134,7 @@ def resize(
             size (`Dict[str, int]`):
                 Size of the output image.
             resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
-                Resampling filter to use when resiizing the image.
+                Resampling filter to use when resizing the image.
             data_format (`str` or `ChannelDimension`, *optional*):
                 The channel dimension format of the image. If not provided, it will be the same as the input image.
         """
diff --git a/paddlenlp/transformers/qwen2_moe/configuration.py b/paddlenlp/transformers/qwen2_moe/configuration.py
index baf7a2551afa..c53f502978c4 100644
--- a/paddlenlp/transformers/qwen2_moe/configuration.py
+++ b/paddlenlp/transformers/qwen2_moe/configuration.py
@@ -87,7 +87,7 @@ class Qwen2MoeConfig(PretrainedConfig):
         norm_topk_prob (`bool`, *optional*, defaults to `False`):
             Whether to normalize the topk probabilities.
         output_router_logits (`bool`, *optional*, defaults to `False`):
-            Whether or not the router logits should be returned by the model. Enabeling this will also
+            Whether or not the router logits should be returned by the model. Enabling this will also
             allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
         router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
             The aux loss factor for the total loss.
diff --git a/paddlenlp/transformers/tokenizer_utils.py b/paddlenlp/transformers/tokenizer_utils.py
index bdccbeae8529..db885c865cbc 100644
--- a/paddlenlp/transformers/tokenizer_utils.py
+++ b/paddlenlp/transformers/tokenizer_utils.py
@@ -95,7 +95,7 @@ def convert_to_unicode(text):
 
 def whitespace_tokenize(text):
     """
-    Runs basic whitespace cleaning and splitting on a peice of text.
+    Runs basic whitespace cleaning and splitting on a piece of text.
     Args:
         text (str): Text to be tokenized.
     Returns:
@@ -112,7 +112,7 @@ def _is_whitespace(char):
     """
-    Checks whether `chars` is a whitespace character.
+    Checks whether `char` is a whitespace character.
     """
-    # \t, \n, and \r are technically contorl characters but we treat them
+    # \t, \n, and \r are technically control characters but we treat them
     # as whitespace since they are generally considered as such.
     if char == " " or char == "\t" or char == "\n" or char == "\r":
         return True