diff --git a/csrc/cpu/src/avx_weight_only.cc b/csrc/cpu/src/avx_weight_only.cc index 2f0ce1acf7cf..399a28abd21e 100644 --- a/csrc/cpu/src/avx_weight_only.cc +++ b/csrc/cpu/src/avx_weight_only.cc @@ -137,12 +137,12 @@ std::vector InvokeAvxWeightOnly(const paddle::Tensor &x, std::vector> AvxWeightOnlyInferShape( std::vector x_shape, - std::vector weigh_shape) { + std::vector weight_shape) { int m = 1; for (int i = 0; i < x_shape.size() - 1; i++) { m = m * x_shape[i]; } - return {std::vector{m, weigh_shape[1]}}; + return {std::vector{m, weight_shape[1]}}; } std::vector AvxWeightOnlyInferDtype( diff --git a/docs/llm/docs/llm_trainer.rst b/docs/llm/docs/llm_trainer.rst index e69721fdee6d..c8685de337c9 100644 --- a/docs/llm/docs/llm_trainer.rst +++ b/docs/llm/docs/llm_trainer.rst @@ -50,8 +50,8 @@ Trainer进阶分布式能力使用介绍 注: -* 总卡数=sharding_parallel_dergee * tensor_parallel_dergee * pipeline_parallel_degree * data_parallel_degree -* data_parallel_degree 不需要传入参数设置,由 总卡数/(sharding_parallel_dergee * tensor_parallel_dergee * pipeline_parallel_degree) 计算得来 +* 总卡数=sharding_parallel_degree * tensor_parallel_degree * pipeline_parallel_degree * data_parallel_degree +* data_parallel_degree 不需要传入参数设置,由 总卡数/(sharding_parallel_degree * tensor_parallel_degree * pipeline_parallel_degree) 计算得来 .. code-block:: bash @@ -59,36 +59,36 @@ Trainer进阶分布式能力使用介绍 python train.py # 单机(多机)多卡/数据并行 - paddle.distruted.launch --devices "0,1,2,3,4,5,6,7" train.py + paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" train.py # 单机(多机)多卡/Sharding并行 - paddle.distruted.launch --devices "0,1,2,3,4,5,6,7" train.py -sharding "stage2" + paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" train.py -sharding "stage2" # 单机(多机)多卡/Sharding并行 + 数据并行 (sharding4 dp2) - paddle.distruted.launch --devices "0,1,2,3,4,5,6,7" train.py --sharding "stage2" --sharding_parallel_degree 4 + paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" train.py --sharding "stage2" --sharding_parallel_degree 4 # 单机(多机)多卡/ 张量并行 TP8 - paddle.distruted.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 8 + paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 8 # 单机(多机)多卡/ 张量并行+数据并行 TP4 DP2 - paddle.distruted.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 4 + paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 4 # 单机(多机)多卡/ 张量并行+sharding并行 TP4 Sharding2 - paddle.distruted.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 4 \ + paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 4 \ --sharding "stage1" --sharding_parallel_degree 2 # 单机(多机)多卡/ 张量并行+流水线并行 TP2 PP4 - paddle.distruted.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 2 \ + paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 2 \ --pipeline_parallel_degree 4 # 单机(多机)多卡/ 张量并行+流水线并行+sharding并行 TP2 PP2 Sharding2 - paddle.distruted.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 2 \ + paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 2 \ --pipeline_parallel_degree 2 \ --sharding "stage1" --sharding_parallel_degree 2 # 4D 并行,需要两机 # 单机(多机)多卡/ 张量并行+流水线并行+sharding并行 TP2 PP2 Sharding2 DP2 - paddle.distruted.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 2 \ + paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" train.py --tensor_parallel_degree 2 \ --pipeline_parallel_degree 2 \ --sharding "stage1" --sharding_parallel_degree 2 @@ -115,7 +115,7 @@ Trainer 分布式能力 对于通用的分布式能力, PaddleNLP适配了数据并行data_parallel, 分布式参数sharding功能的支持。 -用户使用 paddle.distruted.launch --devices "0,1,2,3" train.py即可将运行的程序切换为多卡数据并行. 如果想要使用sharding功能, 减少模型显存占用, 指定参数--sharding "stage2"即可. 更多sharding功能配置见参数介绍部分. +用户使用 paddle.distributed.launch --devices "0,1,2,3" train.py即可将运行的程序切换为多卡数据并行. 如果想要使用sharding功能, 减少模型显存占用, 指定参数--sharding "stage2"即可. 更多sharding功能配置见参数介绍部分. DP 或者sharding,这类功能无需用户修改组网, 直接多卡即可运行。目前已经支持PaddleNLP所有模型。 diff --git a/paddlenlp/cli/main.py b/paddlenlp/cli/main.py index d79904c16c83..928309d70a73 100644 --- a/paddlenlp/cli/main.py +++ b/paddlenlp/cli/main.py @@ -20,7 +20,7 @@ from paddlenlp.utils.import_utils import is_package_available -# check whether the package is avaliable and give friendly description. +# check whether the package is available and give friendly description. if not is_package_available("typer"): raise ModuleNotFoundError( "paddlenlp-cli tools is not installed correctly, you can use the following command" @@ -110,7 +110,7 @@ def download( >>> paddlenlp download -c ./my-models -f bert \n Args:\n - model_name (str): pretarined model name, you can checkout all of model from source code. \n + model_name (str): pretrained model name, you can checkout all of model from source code. \n cache_dir (str, optional): the cache_dir. Defaults to "./models". """ if not os.path.isabs(cache_dir): @@ -241,7 +241,7 @@ def install( def main(): - """the PaddleNLPCLI entry""" + """the PaddleNLP CLI entry""" app() diff --git a/paddlenlp/data/causal_dataset.py b/paddlenlp/data/causal_dataset.py index 05a4b679835a..8b6c09dd9935 100644 --- a/paddlenlp/data/causal_dataset.py +++ b/paddlenlp/data/causal_dataset.py @@ -91,7 +91,7 @@ def get_datasets_weights_and_num_samples(data_prefix, train_val_test_num_samples assert weight_sum > 0.0 weights = [weight / weight_sum for weight in weights] - # Add 0.5% (the 1.005 factor) so in case the bleding dataset does + # Add 0.5% (the 1.005 factor) so in case the blending dataset does # not uniformly distribute the number of samples, we still have # samples left to feed to the network. # (NOTE, yujun06): This is a workaround to avoid issues with indexing in the blending dataset. Therefore, we need to add 20 samples to each dataset. @@ -349,7 +349,7 @@ def __init__( paddle.distributed.barrier() def __len__(self): - # -1 is due to data structure used to retieve the index: + # -1 is due to data structure used to retrieve the index: # sample i --> [sample_idx[i], sample_idx[i+1]) return self.sample_idx.shape[0] - 1 @@ -475,7 +475,7 @@ def _build_index_mappings( # if build_indices and paddle.distributed.get_rank() == 0: print( - f"searching for causual dataset, build_indices={build_indices}, share_folder {share_folder}, check_rank_flag {check_rank_flag}", + f"searching for causal dataset, build_indices={build_indices}, share_folder {share_folder}, check_rank_flag {check_rank_flag}", flush=True, ) if check_rank_flag: @@ -500,7 +500,7 @@ def _build_index_mappings( num_samples_per_epoch + 1 ), "last epoch number of samples exceeded max value." # If we have less than 80% of the samples for the last epoch, - # seperate out the epoch and treat it differently. + # separate out the epoch and treat it differently. # Note: the 80% number is just based on common sense and can # be adjusted if needed. separate_last_epoch = last_epoch_num_samples < int(0.80 * num_samples_per_epoch) @@ -530,7 +530,7 @@ def _build_index_mappings( doc_idx = _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch) np.save(idx_path["doc"], doc_idx, allow_pickle=True) print_rank_0( - " > elasped time to build and save doc-idx mapping " + " > elapsed time to build and save doc-idx mapping " "(seconds): {:4f}".format(time.time() - start_time) ) # sample-idx. @@ -545,12 +545,12 @@ def _build_index_mappings( sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch) np.save(idx_path["sample"], sample_idx, allow_pickle=True) print_rank_0( - " > elasped time to build and save sample-idx mapping " + " > elapsed time to build and save sample-idx mapping " "(seconds): {:4f}".format(time.time() - start_time) ) # shuffle-idx. start_time = time.time() - # -1 is due to data structure used to retieve the index: + # -1 is due to data structure used to retrieve the index: # sample i --> [sample_idx[i], sample_idx[i+1]) if separate_last_epoch: num_samples_ = num_samples_from_epochs_minus_one @@ -559,7 +559,7 @@ def _build_index_mappings( shuffle_idx = _build_shuffle_idx(num_samples_, sample_idx.shape[0] - 1, np_rng) np.save(idx_path["shuffle"], shuffle_idx, allow_pickle=True) print_rank_0( - " > elasped time to build and save shuffle-idx mapping" + " > elapsed time to build and save shuffle-idx mapping" " (seconds): {:4f}".format(time.time() - start_time) ) except OSError: @@ -610,7 +610,7 @@ def _num_tokens(documents, sizes): def _num_epochs(tokens_per_epoch, seq_length, num_samples): - """Based on number of samples and sequence lenght, calculate how many + """Based on number of samples and sequence length, calculate how many epochs will be needed.""" num_epochs = 0 total_tokens = 0 @@ -625,7 +625,7 @@ def _num_epochs(tokens_per_epoch, seq_length, num_samples): def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch): - """Build an array with length = number-of-epochs * number-of-dcuments. + """Build an array with length = number-of-epochs * number-of-documents. Each index is mapped to a corresponding document.""" if not separate_last_epoch or num_epochs == 1: doc_idx = np.mgrid[0:num_epochs, 0 : len(documents)][1] @@ -654,7 +654,7 @@ def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch): sample_index = 0 # Index into doc_idx. doc_idx_index = 0 - # Begining offset for each document. + # Beginning offset for each document. doc_offset = 0 # Start with first document and no offset. sample_idx[sample_index][0] = doc_idx_index diff --git a/paddlenlp/data/dist_dataloader.py b/paddlenlp/data/dist_dataloader.py index 40f586af96b9..0f125f4b76c1 100644 --- a/paddlenlp/data/dist_dataloader.py +++ b/paddlenlp/data/dist_dataloader.py @@ -183,7 +183,7 @@ def _broadcast_data(self, data): data = nested_broadcast_tensor(data, src=self.mp_src_rank, group=self.mp_group) if dst_pp_group is not None: data = nested_broadcast_tensor(data, src=dst_pp_group.ranks[0], group=dst_pp_group) - # for pp1 - pp_{n-1}, Paddle need to recevie empty dict for pipeline parallel. + # for pp1 - pp_{n-1}, Paddle need to receive empty dict for pipeline parallel. if data is None: data = {} diff --git a/paddlenlp/dataaug/char.py b/paddlenlp/dataaug/char.py index dbfc3b61caee..5f2c43d1fcf6 100644 --- a/paddlenlp/dataaug/char.py +++ b/paddlenlp/dataaug/char.py @@ -76,7 +76,7 @@ def __init__( if isinstance(aug_type, str): self.type = aug_type if aug_type in ["antonym", "homonym", "custom"]: - self.dict = self._load_substitue_dict(aug_type) + self.dict = self._load_substitute_dict(aug_type) elif aug_type in ["mlm"]: self.mlm_model = AutoModelForMaskedLM.from_pretrained(self.model_name) self.mlm_tokenizer = AutoTokenizer.from_pretrained(self.model_name) @@ -92,7 +92,7 @@ def __init__( # Merge dictionaries from different sources for t in aug_type: if t in ["antonym", "homonym", "custom"]: - t_dict = self._load_substitue_dict(t) + t_dict = self._load_substitute_dict(t) for k in t_dict: if k in self.dict: self.dict[k] = list(set(self.dict[k] + t_dict[k])) @@ -101,7 +101,7 @@ def __init__( else: self.type = aug_type - def _load_substitue_dict(self, source_type): + def _load_substitute_dict(self, source_type): """Load substitution dictionary""" if source_type in ["antonym", "homonym"]: fullname = self._load_file("char_" + source_type) @@ -112,15 +112,15 @@ def _load_substitue_dict(self, source_type): if os.path.exists(fullname): with open(fullname, "r", encoding="utf-8") as f: - substitue_dict = json.load(f) + substitute_dict = json.load(f) f.close() else: raise ValueError("The {} should exist.".format(fullname)) - return substitue_dict + return substitute_dict def _generate_sequence(self, output_seq_tokens, aug_tokens): - """Genearte the sequences according to the mapping list""" + """Generate the sequences according to the mapping list""" for aug_token in aug_tokens: idx, token = aug_token output_seq_tokens[int(idx)] = token diff --git a/paddlenlp/dataaug/word.py b/paddlenlp/dataaug/word.py index 438935e54ac4..b4e5833af545 100644 --- a/paddlenlp/dataaug/word.py +++ b/paddlenlp/dataaug/word.py @@ -86,7 +86,7 @@ def __init__( if isinstance(aug_type, str): self.type = aug_type if aug_type in ["antonym", "embedding", "synonym", "homonym", "custom"]: - self.dict = self._load_substitue_dict(aug_type) + self.dict = self._load_substitute_dict(aug_type) elif aug_type in ["mlm"]: self.mlm_model = AutoModelForMaskedLM.from_pretrained(self.model_name) self.mlm_tokenizer = AutoTokenizer.from_pretrained(self.model_name) @@ -102,7 +102,7 @@ def __init__( # Merge dictionaries from different sources for t in aug_type: if t in ["antonym", "embedding", "synonym", "homonym", "custom"]: - t_dict = self._load_substitue_dict(t) + t_dict = self._load_substitute_dict(t) for k in t_dict: if k in self.dict: self.dict[k] = list(set(self.dict[k] + t_dict[k])) @@ -151,7 +151,7 @@ def _calculate_tfidf(self, sequence, seq_tokens, aug_indexes): tfidf.append(tf * idf) return np.array(tfidf) - def _load_substitue_dict(self, source_type): + def _load_substitute_dict(self, source_type): """Load substitution dictionary""" if source_type in ["antonym", "embedding", "synonym", "homonym"]: fullname = self._load_file("word_" + source_type) @@ -162,12 +162,12 @@ def _load_substitue_dict(self, source_type): if os.path.exists(fullname): with open(fullname, "r", encoding="utf-8") as f: - substitue_dict = json.load(f) + substitute_dict = json.load(f) f.close() else: raise ValueError("The {} should exist.".format(fullname)) - return substitue_dict + return substitute_dict def _generate_sequence(self, output_seq_tokens, aug_tokens): """Genearte the sequences according to the mapping list""" diff --git a/paddlenlp/transformers/ernie_vil/configuration.py b/paddlenlp/transformers/ernie_vil/configuration.py index 41e9a43b5fba..b7d773ccaad6 100644 --- a/paddlenlp/transformers/ernie_vil/configuration.py +++ b/paddlenlp/transformers/ernie_vil/configuration.py @@ -176,7 +176,7 @@ class ErnieViLVisionConfig(PretrainedConfig): layer_norm_eps (`float`, *optional*, defaults to 1e-6): The epsilon used by the layer normalization layers. dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. initializer_range (`float`, *optional*, defaults to 0.02): @@ -266,7 +266,7 @@ class ErnieViLConfig(PretrainedConfig): vision_config (`dict`, *optional*): Dictionary of configuration options used to initialize [`ErnieViLVisionConfig`]. logit_scale_init_value (`float`, *optional*, defaults to 2.6592): - The inital value of the *logit_scale* paramter. Default is used as per the original ErnieViL implementation. + The initial value of the *logit_scale* parameter. Default is used as per the original ErnieViL implementation. kwargs (*optional*): Dictionary of keyword arguments. diff --git a/paddlenlp/transformers/ernie_vil/image_processing.py b/paddlenlp/transformers/ernie_vil/image_processing.py index 5873eb2a51c5..3b1c4b9f16bf 100644 --- a/paddlenlp/transformers/ernie_vil/image_processing.py +++ b/paddlenlp/transformers/ernie_vil/image_processing.py @@ -134,7 +134,7 @@ def resize( size (`Dict[str, int]`): Size of the output image. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): - Resampling filter to use when resiizing the image. + Resampling filter to use when resizing the image. data_format (`str` or `ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. """ diff --git a/paddlenlp/transformers/qwen2_moe/configuration.py b/paddlenlp/transformers/qwen2_moe/configuration.py index baf7a2551afa..c53f502978c4 100644 --- a/paddlenlp/transformers/qwen2_moe/configuration.py +++ b/paddlenlp/transformers/qwen2_moe/configuration.py @@ -87,7 +87,7 @@ class Qwen2MoeConfig(PretrainedConfig): norm_topk_prob (`bool`, *optional*, defaults to `False`): Whether to normalize the topk probabilities. output_router_logits (`bool`, *optional*, defaults to `False`): - Whether or not the router logits should be returned by the model. Enabeling this will also + Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss, including load balancing loss and router z-loss. router_aux_loss_coef (`float`, *optional*, defaults to 0.001): The aux loss factor for the total loss. diff --git a/paddlenlp/transformers/tokenizer_utils.py b/paddlenlp/transformers/tokenizer_utils.py index bdccbeae8529..db885c865cbc 100644 --- a/paddlenlp/transformers/tokenizer_utils.py +++ b/paddlenlp/transformers/tokenizer_utils.py @@ -95,7 +95,7 @@ def convert_to_unicode(text): def whitespace_tokenize(text): """ - Runs basic whitespace cleaning and splitting on a peice of text. + Runs basic whitespace cleaning and splitting on a piece of text. Args: text (str): Text to be tokenized. Returns: @@ -112,7 +112,7 @@ def _is_whitespace(char): """ Checks whether `chars` is a whitespace character. """ - # \t, \n, and \r are technically contorl characters but we treat them + # \t, \n, and \r are technically control characters but we treat them # as whitespace since they are generally considered as such. if char == " " or char == "\t" or char == "\n" or char == "\r": return True