From 699f73a5fe333e69b0b59fde5ebe1567ab50b054 Mon Sep 17 00:00:00 2001
From: Sijun He
Date: Tue, 30 May 2023 21:10:52 +0800
Subject: [PATCH] [Bug fix] fix LoRA unsync parameters issue (#6048)

* fix styles

* remove extra print
---
 paddlenlp/layers/lora.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/paddlenlp/layers/lora.py b/paddlenlp/layers/lora.py
index b836a84cc8da..1cd18aab8938 100644
--- a/paddlenlp/layers/lora.py
+++ b/paddlenlp/layers/lora.py
@@ -150,12 +150,15 @@ def __init__(
             is_bias=False,
             attr=lora_A_weight_attr,
         )
+        self.lora_A.is_distributed = False
         self.lora_B = self.create_parameter(
             shape=[r, self.output_size_per_partition],
             dtype=self._dtype,
             is_bias=False,
             default_initializer=nn.initializer.Constant(value=0.0),
         )
+        self.lora_B.is_distributed = True
+        self.lora_B.split_axis = 1
         self.scaling = self.lora_alpha / self.r
 
         # Freezing the pre-trained weight matrix
@@ -184,8 +187,9 @@ def forward(self, input: paddle.Tensor):
         result_mp = F.linear(x=input_mp, weight=self.weight, bias=self.bias, name=self.name)
 
         if self.r > 0 and not self.merged:
-            input_a = self.lora_dropout(input_mp) @ self.lora_A
-            delta_mp = (input_a @ self.lora_B) * self.scaling
+            input_a = self.lora_dropout(input) @ self.lora_A
+            input_a_mp = mp_ops._c_identity(input_a, group=self.model_parallel_group)
+            delta_mp = (input_a_mp @ self.lora_B) * self.scaling
             result_mp += delta_mp
 
         if self.gather_output and self.is_mp:
@@ -378,6 +382,7 @@ def __init__(
             is_bias=False,
             attr=lora_A_weight_attr,
         )
+        self.lora_A.is_distributed = False
         # Make sure lora_B is split in column the same as ColumnParallelLoRALinear.
         self.lora_B = self.create_parameter(
             shape=[r, self.output_size_per_partition // len(enable_lora) * sum(enable_lora)],
@@ -385,6 +390,8 @@
             dtype=self._dtype,
             is_bias=False,
             default_initializer=nn.initializer.Constant(value=0.0),
         )
+        self.lora_B.is_distributed = True
+        self.lora_B.split_axis = 1
         self.scaling = self.lora_alpha / self.r
         # Freezing the pre-trained weight matrix
@@ -444,11 +451,12 @@ def forward(self, input: paddle.Tensor):
         # [batch_size, *, out_features_per_partition]
         result_mp = F.linear(x=input_mp, weight=self.weight, bias=self.bias, name=self.name)
         if self.r > 0 and any(self.enable_lora) and not self.merged:
-            input_a = self.lora_dropout(input_mp) @ self.lora_A
+            input_a = self.lora_dropout(input) @ self.lora_A
+            input_a_mp = mp_ops._c_identity(input_a, group=self.model_parallel_group)
             if input_a.dim() == 3:
                 delta_mp = (
                     F.conv1d(
-                        input_a_mp.transpose([0, 2, 1]),
+                        input_a_mp.transpose([0, 2, 1]),
                         self.lora_B.T.unsqueeze(-1),
                         groups=sum(self.enable_lora),
                     )
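
Note on the sharding scheme this patch sets up: `lora_A` is kept replicated on every model-parallel rank (`is_distributed = False`), while `lora_B` is split along its output dimension (`is_distributed = True`, `split_axis = 1`), matching the column-split base weight. The forward pass now applies `lora_A` to the full input and only then routes the result through `mp_ops._c_identity`, which, as I understand it, is an identity in the forward pass and all-reduces the gradient across the model-parallel group in the backward pass, so the replicated `lora_A` no longer drifts out of sync between ranks. The snippet below is a minimal NumPy sketch, not PaddleNLP code, that checks the underlying math; the shapes, rank count, and `scaling` value are made up for illustration.

    import numpy as np

    # Toy sizes; world_size plays the role of the model-parallel degree.
    rng = np.random.default_rng(0)
    in_features, out_features, r, world_size = 8, 12, 4, 2
    scaling = 2.0  # stands in for lora_alpha / r

    x = rng.standard_normal((3, in_features))           # full input, replicated on every rank
    weight = rng.standard_normal((in_features, out_features))
    lora_A = rng.standard_normal((in_features, r))       # replicated (is_distributed = False)
    lora_B = rng.standard_normal((r, out_features))      # split along axis 1 (split_axis = 1)

    # Single-device reference result.
    reference = x @ weight + (x @ lora_A @ lora_B) * scaling

    # Simulated column-parallel execution: each rank holds one column shard of
    # `weight` and `lora_B`; `x @ lora_A` is computed from the full input, so it
    # is identical on every rank, which is what the fixed forward relies on.
    weight_shards = np.split(weight, world_size, axis=1)
    lora_B_shards = np.split(lora_B, world_size, axis=1)
    input_a = x @ lora_A
    partials = [x @ w_mp + (input_a @ b_mp) * scaling
                for w_mp, b_mp in zip(weight_shards, lora_B_shards)]

    # Gathering the per-rank outputs reproduces the single-device result.
    assert np.allclose(np.concatenate(partials, axis=1), reference)

Splitting `lora_B` column-wise rather than replicating it also keeps weight merging consistent, since `lora_A @ lora_B` then shards along the same axis as the base weight.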