add local_layer in moe layer
pkuzyc committed Feb 17, 2025
1 parent 31ec76c commit 0a67c68
Showing 6 changed files with 1,005 additions and 28 deletions.
6 changes: 3 additions & 3 deletions llm/auto_parallel/deepseek-v2/run_pretrain_auto.py
@@ -553,9 +553,9 @@ def main():
# if training_args.bf16:
# dtype = "bfloat16"

with paddle.LazyGuard():
model = model_class.from_config(config, dtype="float32")
criterion = criterion_class(config)
# with paddle.LazyGuard():
model = model_class.from_config(config, dtype="float32")
criterion = criterion_class(config)

if training_args.recompute:

19 changes: 11 additions & 8 deletions llm/auto_parallel/deepseek-v2/run_pretrain_auto.sh
@@ -19,10 +19,11 @@ unset CUDA_VISIBLE_DEVICES
task_name="deepseekv2"
rm -rf output/$task_name/
rm -rf "output/$task_name""_log"
rm -rf /root/paddlejob/workspace/env_run/xuxinyi/PaddleNLP/llm/auto_parallel/deepseek-v2/log
rm -rf /root/paddle/paddle_models/PaddleNLP/llm/auto_parallel/deepseek-v2/log
# rm -rf /root/paddlejob/workspace/env_run/xuxinyi/PaddleNLP/llm/auto_parallel/deepseek-v2/log

export SOT_LOG_LEVEL=4
export PYTHONPATH=/root/paddlejob/workspace/env_run/xuxinyi/PaddleNLP:$PYTHONPATH
# export PYTHONPATH=/root/paddlejob/workspace/env_run/xuxinyi/PaddleNLP:$PYTHONPATH
#ulimit -c unlimited
# export GLOG_v=3

@@ -33,29 +34,31 @@ export PYTHONPATH=/root/paddlejob/workspace/env_run/xuxinyi/PaddleNLP:$PYTHONPAT
# export FLAGS_cudnn_deterministic=1
# export NVIDIA_TF32_OVERRIDE=0

export GLOG_v=6
export FLAGS_call_stack_level=2
to_static=0 # whether to enable dynamic-to-static (to_static) training

python -u -m paddle.distributed.launch \
--gpus "0,1,2,3" \
--gpus "2,3" \
--log_dir "log" \
run_pretrain_auto.py \
--model_type "deepseekv2_auto" \
--model_name_or_path "deepseek-ai/DeepSeek-V2-Lite" \
--tokenizer_name_or_path "deepseek-ai/DeepSeek-V2-Lite" \
--model_name_or_path "./model_config" \
--tokenizer_name_or_path "./model_config" \
--input_dir "./data" \
--output_dir "output/$task_name" \
--split 949,50,1 \
--max_seq_length 2048 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 2 \
--gradient_accumulation_steps 2 \
--gradient_accumulation_steps 1 \
--use_flash_attention 0 \
--use_fused_rms_norm 1 \
--fp16 0 \
--fp16_opt_level "O2" \
--fp16_opt_level "O0" \
--scale_loss 1024 \
--pipeline_parallel_degree 1 \
--tensor_parallel_degree 2 \
--tensor_parallel_degree 1 \
--sharding_parallel_degree 1 \
--learning_rate 0.0001 \
--min_learning_rate 0.00001 \
6 changes: 6 additions & 0 deletions paddlenlp/transformers/deepseek_v2/modeling.py
@@ -647,6 +647,8 @@ def __init__(self, config, num_experts, expert_hidden_size, **kwargs):
is_bias=False,
default_initializer=nn.initializer.Constant(1.0),
)
print("==== weight after init ====")
print(self.weight)

if config.topk_method == "noaux_tc":
self.e_score_correction_bias = paddle.create_parameter(
@@ -663,6 +665,10 @@ def forward(self, hidden_states):
_, h_dim = hidden_states.shape

# compute gating score
print("==== weight ====")
print(self.weight)
print("==== hidden_states ====")
print(hidden_states)
logits = F.linear(hidden_states, self.weight, None)

with paddle.amp.auto_cast(False):
96 changes: 79 additions & 17 deletions paddlenlp/transformers/deepseek_v2/modeling_auto.py
@@ -49,17 +49,16 @@
from ..llama.modeling import get_use_casual_mask
from ..model_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ..model_utils import PretrainedModel, register_base_model
from ..moe_layer import MoELayer
from ..moe_layer_auto import MoELayer
from ..moe_gate_auto import PretrainedMoEGate
from .configuration import DeepseekV2Config
from .modeling import (
AddAuxiliaryLoss,
DeepseekV2DynamicNTKScalingRotaryEmbedding,
DeepseekV2LinearScalingRotaryEmbedding,
DeepseekV2PretrainingCriterion,
DeepseekV2RMSNorm,
DeepseekV2RotaryEmbedding,
DeepseekV2YarnRotaryEmbedding,
MoEGate,
_expand_2d_mask,
_make_causal_mask,
apply_rotary_pos_emb,
@@ -169,6 +168,68 @@ def scaled_dot_product_attention(
return (attn_output, attn_weights) if output_attentions else attn_output


class MoEGate(PretrainedMoEGate):
def __init__(self, config, num_experts, expert_hidden_size, **kwargs):
super().__init__(config, num_experts, expert_hidden_size, **kwargs)
# [hidden_size, n_expert]

self.scoring_func = config.scoring_func
self.topk_method = config.topk_method

self.weight = paddle.create_parameter(
shape=[expert_hidden_size, num_experts],
dtype=paddle.get_default_dtype(),
is_bias=False,
default_initializer=nn.initializer.Constant(1.0),
)

if config.topk_method == "noaux_tc":
self.e_score_correction_bias = paddle.create_parameter(
shape=[num_experts],
dtype=paddle.get_default_dtype(),
default_initializer=nn.initializer.Constant(0.0),
)

def forward(self, hidden_states):
"""
Args:
hidden_states (_type_): [batch_size * seq_len, hidden_size]
"""
_, h_dim = hidden_states.shape

# compute gating score
logits = F.linear(hidden_states, self.weight, None)

with paddle.amp.auto_cast(False):
scores = self.gate_score_func(logits=logits)
scores = scores.cast(paddle.get_default_dtype())

capacity, combine_weights, dispatch_mask, exp_counts, l_aux, l_zloss = self.topkgating(scores)

return capacity, combine_weights, dispatch_mask, exp_counts, l_aux, l_zloss
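
For reference, a minimal sketch (not part of this commit) of the tensor shapes this gate works with, assuming 8 experts, a hidden size of 16, softmax scoring, and top-2 routing; hidden_states and gate_weight here are illustrative stand-ins:

import paddle
import paddle.nn.functional as F

num_experts, hidden_size = 8, 16
hidden_states = paddle.randn([4, hidden_size])             # [batch_size * seq_len, hidden_size]
gate_weight = paddle.randn([hidden_size, num_experts])     # same layout as self.weight above

logits = F.linear(hidden_states, gate_weight)              # [4, num_experts]
scores = F.softmax(logits, axis=-1)                        # stand-in for gate_score_func
topk_scores, topk_idx = paddle.topk(scores, k=2, axis=-1)  # route each token to 2 experts
print(topk_idx.shape)                                      # [4, 2]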


class AddAuxiliaryLoss(paddle.autograd.PyLayer):
"""
The trick function of adding auxiliary (aux) loss,
which includes the gradient of the aux loss during backpropagation.
"""

@staticmethod
def forward(ctx, x, loss):
assert paddle.numel(loss) == 1
ctx.dtype = loss.dtype
ctx.required_aux_loss = not loss.stop_gradient
return x

@staticmethod
def backward(ctx, grad_output):
grad_loss = None
if ctx.required_aux_loss:
grad_loss = paddle.ones(1, dtype=ctx.dtype)
return grad_output, grad_loss
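
A hedged usage sketch (assuming paddle is importable and the AddAuxiliaryLoss class above is in scope; tensor names are illustrative): forward returns x unchanged, while backward also emits a unit gradient for the aux loss, so the gate's load-balancing loss is trained without altering the layer output.

import paddle

x = paddle.randn([4, 16])
x.stop_gradient = False
l_aux = (x * 0.0).mean() + 0.01          # stand-in for the gate's scalar aux loss
out = AddAuxiliaryLoss.apply(x, l_aux)   # forward: returns x as-is
out.sum().backward()                     # backward: x receives grad_output, l_aux receives ones(1)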


class DeepseekV2MLPAuto(nn.Layer):
def __init__(self, config: DeepseekV2Config, hidden_size=None, intermediate_size=None, is_moe=False):
super().__init__()
@@ -972,18 +1033,19 @@ def _reorder_cache(past_key_values, beam_idx):
def auto_dist_config(self, prefix=""):
if prefix != "":
assert prefix.endswith(".")
config = {
"mp_config": {
"parallelize_plan": {
f"{prefix}deepseek_v2.embed_tokens": dist.ColWiseParallel(gather_output=True),
f"{prefix}deepseek_v2.layers.*.self_attn.q_proj": dist.ColWiseParallel(),
f"{prefix}deepseek_v2.layers.*.self_attn.kv_b_proj": dist.ColWiseParallel(),
f"{prefix}deepseek_v2.layers.*.self_attn.o_proj": dist.RowWiseParallel(),
f"{prefix}deepseek_v2.layers.*.mlp.gate_proj": dist.ColWiseParallel(),
f"{prefix}deepseek_v2.layers.*.mlp.up_proj": dist.ColWiseParallel(),
f"{prefix}deepseek_v2.layers.*.mlp.down_proj": dist.RowWiseParallel(),
f"{prefix}lm_head.weight": dist.ColWiseParallel(),
}
},
}
config = {}
# config = {
# "mp_config": {
# "parallelize_plan": {
# f"{prefix}deepseek_v2.embed_tokens": dist.ColWiseParallel(gather_output=True),
# f"{prefix}deepseek_v2.layers.*.self_attn.q_proj": dist.ColWiseParallel(),
# f"{prefix}deepseek_v2.layers.*.self_attn.kv_b_proj": dist.ColWiseParallel(),
# f"{prefix}deepseek_v2.layers.*.self_attn.o_proj": dist.RowWiseParallel(),
# f"{prefix}deepseek_v2.layers.*.mlp.gate_proj": dist.ColWiseParallel(),
# f"{prefix}deepseek_v2.layers.*.mlp.up_proj": dist.ColWiseParallel(),
# f"{prefix}deepseek_v2.layers.*.mlp.down_proj": dist.RowWiseParallel(),
# f"{prefix}lm_head.weight": dist.ColWiseParallel(),
# }
# },
# }
return config
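
A hedged sketch of the effect of this change (the variable name model is an assumption for whichever auto model in this file exposes the method): with the parallelize_plan commented out, auto_dist_config now returns an empty dict, so no tensor-parallel plan is handed to the auto-parallel runtime.

cfg = model.auto_dist_config(prefix="model.")  # prefix must end with "." per the assert above
assert cfg == {}                               # no mp_config is applied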