add long sequence strategies #8076

Merged · 30 commits · Mar 26, 2024
49 changes: 49 additions & 0 deletions paddlenlp/transformers/LongSequenceStrategies/AttentionStrategies.py
@@ -0,0 +1,49 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
Review comment (Member): File names and directory names should be lowercase.

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math

import paddle
from paddle import Tensor, nn

__all__ = ["AttentionWithLinearBias"]


class AttentionWithLinearBias(nn.Layer):
    def __init__(self, **init_args):
        super().__init__()

    def _get_interleave(self, n):
        def _get_interleave_power_of_2(n):
            start = 2 ** (-(2 ** -(math.log2(n) - 3)))
            return [start * start**i for i in range(n)]

        if math.log2(n).is_integer():
            return _get_interleave_power_of_2(n)
        else:
            closest_power_of_2 = 2 ** math.floor(math.log2(n))
            return (
                _get_interleave_power_of_2(closest_power_of_2)
                + self._get_interleave(2 * closest_power_of_2)[0::2][: n - closest_power_of_2]
            )

    def forward(self, bool_attention_mask: Tensor, num_heads: int, dtype: paddle.dtype):
        attention_mask = bool_attention_mask.astype("float32")
        batch_size, seq_length = attention_mask.shape[0], attention_mask.shape[-1]
        slopes = paddle.to_tensor(self._get_interleave(num_heads), dtype="float32")
        alibi = slopes.unsqueeze(axis=[1, 2]) * paddle.arange(seq_length, dtype="float32").unsqueeze(
            axis=[0, 1]
        ).expand([num_heads, -1, -1])
        alibi = alibi.reshape(shape=(1, num_heads, 1, seq_length)).expand([batch_size, -1, -1, -1])
        return paddle.cast(alibi, dtype)
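Note: `_get_interleave` produces the per-head ALiBi slopes (a geometric sequence, interleaved when the head count is not a power of two), and `forward` multiplies them by position indices to build a `[batch, num_heads, 1, seq_len]` bias that is added to attention scores. A minimal usage sketch, assuming the package path added in this PR (the review above suggests it may later be renamed to lowercase); shapes are illustrative:

```python
import math

import paddle

from paddlenlp.transformers.LongSequenceStrategies import AttentionWithLinearBias

batch_size, num_heads, seq_len = 2, 8, 16  # illustrative shapes

alibi_layer = AttentionWithLinearBias()
bool_mask = paddle.ones([batch_size, seq_len], dtype="bool")

# Bias tensor to be added to the attention scores before softmax.
alibi = alibi_layer(bool_mask, num_heads, dtype=paddle.float16)
print(alibi.shape)  # [2, 8, 1, 16]

# For a power-of-two head count, slopes[i] == 2 ** (-8 * (i + 1) / num_heads).
slopes = alibi_layer._get_interleave(num_heads)
assert math.isclose(slopes[0], 2 ** (-8 / num_heads))
```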
122 changes: 122 additions & 0 deletions paddlenlp/transformers/LongSequenceStrategies/EmbeddingStrategies.py
@@ -0,0 +1,122 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
from paddle import nn

__all__ = [
"RotaryEmbedding",
"LinearScalingRotaryEmbedding",
"NTKScalingRotaryEmbedding",
"DynamicNTKScalingRotaryEmbedding",
]


class RotaryEmbedding(nn.Layer):
    def __init__(self, **init_args):
        super().__init__()
        self.dim = init_args["dim"]
        self.max_position_embeddings = init_args["max_position_embeddings"]
        self.base = init_args["base"]
        self.position_encoding_2d = init_args["position_encoding_2d"] if "position_encoding_2d" in init_args else False
        if self.position_encoding_2d:
            # [dim / 4]  2D embedding
            self.dim = self.dim / 2
            inv_freq = 1.0 / (
                self.base ** (paddle.cast(paddle.arange(0, self.dim, 2), dtype=paddle.float32) / self.dim)
            )
        else:
            # [dim / 2]
            inv_freq = 1.0 / (
                self.base ** (paddle.cast(paddle.arange(0, self.dim, 2), dtype=paddle.float32) / self.dim)
            )
        self.register_buffer("inv_freq", inv_freq)
        self._set_cos_sin_cache(seq_len=self.max_position_embeddings)

    def _set_cos_sin_cache(self, seq_len):
        self.max_seq_len_cached = seq_len
        # [seq_len]
        t = paddle.arange(seq_len, dtype=paddle.float32)
        # [seq_len, dim/2]
        with paddle.amp.auto_cast(enable=False):
            freqs = paddle.outer(t.astype(self.inv_freq.dtype), self.inv_freq)
        # [seq_len, dim]
        emb = paddle.concat([freqs, freqs], axis=-1)
        self.cos_cached = emb.cos()[:, :]
        self.sin_cached = emb.sin()[:, :]

    def forward(self, seq_len=None, ntk_alpha=None):
        return self.cos_cached[:, :], self.sin_cached[:, :]


class LinearScalingRotaryEmbedding(RotaryEmbedding):
    def __init__(self, **init_args):
        self.scaling_factor = init_args["scaling_factor"]
        super().__init__(**init_args)

    def _set_cos_sin_cache(self, seq_len):
        self.max_seq_len_cached = seq_len
        # [seq_len]
        t = paddle.arange(seq_len, dtype=paddle.float32)
        t = t / self.scaling_factor
        # [seq_len, dim/2]
        with paddle.amp.auto_cast(enable=False):
            freqs = paddle.outer(t.astype(self.inv_freq.dtype), self.inv_freq)
        # [seq_len, dim]
        emb = paddle.concat([freqs, freqs], axis=-1)
        self.cos_cached = emb.cos()[:, :]
        self.sin_cached = emb.sin()[:, :]


class NTKScalingRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding extended with NTK scaling. https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/"""

    def __init__(self, **init_args):
        init_args["base"] = init_args["base"] * init_args["scaling_factor"] ** (
            init_args["dim"] / (init_args["dim"] - 2)
        )
        super().__init__(**init_args)


class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding extended with Dynamic NTK scaling. https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/"""

    def __init__(self, **init_args):
        self.scaling_factor = init_args["scaling_factor"]
        self._seq_len_cached = 0
        super().__init__(**init_args)

    def _scale_cos_sin(self, seq_len, ntk_alpha=None):
        # [seq_len]
        t = paddle.arange(seq_len, dtype=paddle.float32)
        if ntk_alpha is None:
            ntk_alpha = (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
        base = self.base * ntk_alpha ** (self.dim / (self.dim - 2))

        # [seq_len, dim/2]
        inv_freq = 1.0 / (base ** (paddle.cast(paddle.arange(0, self.dim, 2), dtype=paddle.float32) / self.dim))
        with paddle.amp.auto_cast(enable=False):
            freqs = paddle.outer(t.astype(inv_freq.dtype), inv_freq)
        # [seq_len, dim]
        emb = paddle.concat([freqs, freqs], axis=-1)
        self.cos_cached = emb.cos()[:, :]
        self.sin_cached = emb.sin()[:, :]

    def forward(self, seq_len=None, ntk_alpha=None):
        if seq_len > self.max_position_embeddings:
            self._scale_cos_sin(seq_len=seq_len, ntk_alpha=ntk_alpha)

        return super().forward(seq_len)
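Note: `LinearScalingRotaryEmbedding` stretches position indices by `1/scaling_factor`, `NTKScalingRotaryEmbedding` enlarges the rotary base once at construction, and `DynamicNTKScalingRotaryEmbedding` rebuilds the cos/sin cache with a sequence-length-dependent base only when the requested length exceeds `max_position_embeddings`. A minimal sketch with assumed hyperparameters (not taken from the PR's tests):

```python
import paddle

from paddlenlp.transformers.LongSequenceStrategies import (
    DynamicNTKScalingRotaryEmbedding,
    LinearScalingRotaryEmbedding,
)

# Assumed hyperparameters for illustration.
init_args = {"dim": 64, "max_position_embeddings": 2048, "base": 10000, "scaling_factor": 4}

linear_rope = LinearScalingRotaryEmbedding(**init_args)
cos, sin = linear_rope()               # cache built for 2048 positions, indices scaled by 1/4
print(cos.shape)                       # [2048, 64]

dynamic_rope = DynamicNTKScalingRotaryEmbedding(**init_args)
cos, sin = dynamic_rope(seq_len=4096)  # exceeds 2048, so the cache is rebuilt with a larger base
print(cos.shape)                       # [4096, 64]
```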
66 changes: 66 additions & 0 deletions paddlenlp/transformers/LongSequenceStrategies/LongSequenceStrategies.py
@@ -0,0 +1,66 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib

all_strategy_types = ["EmbeddingStrategies", "AttentionStrategies"]


class LongSequenceStrategies:
    @classmethod
    def build_long_sequence_strategy(cls, strategy_type=None, strategy_name=None, **init_args):
        """

        **init_args:   head_dim,
                       max_position_embeddings,
                       rope_scaling_type,
                       rope_scaling_factor,
                       ...

        strategy_type: "None" --------------- use the original built-in module
                       "EmbeddingStrategies",
                       "AttentionStrategies"
                       ...

        strategy_name: "RotaryEmbedding",
                       "LinearScalingRotaryEmbedding",
                       "NTKScalingRotaryEmbedding",
                       "DynamicNTKScalingRotaryEmbedding",
                       "AttentionWithLinearBias"
                       ...

        """

        """
        paddlenlp.transformers.LongSequenceStrategies.{strategy_type<->import_class}.{strategy_name<->strategy_class}
        paddlenlp.transformers.LongSequenceStrategies.{EmbeddingStrategies}.{RoPE,...}
        paddlenlp.transformers.LongSequenceStrategies.{AttentionStrategies}.{ALiBi,...}
        """
        try:
            import_class = importlib.import_module(f"paddlenlp.transformers.LongSequenceStrategies.{strategy_type}")
        except ModuleNotFoundError:
            raise ModuleNotFoundError(

                f"Wrong strategy type {strategy_type}. module only supports the following types: "
                + ", ".join(m for m in all_strategy_types)
            )
        try:
            strategy_class = getattr(import_class, strategy_name)
        except AttributeError:
            all_strategy_classes = import_class.__all__
            raise LookupError(

                f"module '{import_class.__name__}' only supports the following classes: "
                + ", ".join(m for m in all_strategy_classes)
            )
        strategy_instance = strategy_class(**init_args)
        return strategy_instance
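Note: the factory simply resolves the strategy type with `importlib.import_module`, looks up the strategy name with `getattr`, and instantiates the class with `**init_args`. A hedged usage sketch (argument values are illustrative; in the model code below they come from the config):

```python
import paddle

from paddlenlp.transformers.LongSequenceStrategies import LongSequenceStrategies

# Rotary embedding strategy: init_args are forwarded to DynamicNTKScalingRotaryEmbedding.
rope = LongSequenceStrategies.build_long_sequence_strategy(
    "EmbeddingStrategies",
    "DynamicNTKScalingRotaryEmbedding",
    dim=64,
    max_position_embeddings=2048,
    base=10000,
    scaling_factor=4,
)
cos, sin = rope(seq_len=4096)

# Attention bias strategy: AttentionWithLinearBias takes no init_args.
alibi = LongSequenceStrategies.build_long_sequence_strategy(
    "AttentionStrategies",
    "AttentionWithLinearBias",
)
bias = alibi(paddle.ones([1, 4096], dtype="bool"), num_heads=32, dtype=paddle.float32)
```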
18 changes: 18 additions & 0 deletions paddlenlp/transformers/LongSequenceStrategies/__init__.py
@@ -0,0 +1,18 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from .AttentionStrategies import *
from .EmbeddingStrategies import *
from .LongSequenceStrategies import *
9 changes: 9 additions & 0 deletions paddlenlp/transformers/bloom/configuration.py
@@ -125,6 +125,10 @@ def __init__(
        use_recompute=False,
        use_pure_fp16=False,
        use_flash_attention=False,
        long_sequence_strategy_type=None,
        long_sequence_strategy_name=None,
        long_sequence_init_args=None,
        use_long_sequence_strategies=False,
        **kwargs,
    ):

@@ -150,3 +154,8 @@ def __init__(
        self.use_recompute = use_recompute
        self.use_pure_fp16 = use_pure_fp16
        self.use_flash_attention = use_flash_attention

        self.long_sequence_strategy_type = long_sequence_strategy_type
        self.long_sequence_strategy_name = long_sequence_strategy_name
        self.long_sequence_init_args = {} if long_sequence_init_args is None else long_sequence_init_args
        self.use_long_sequence_strategies = use_long_sequence_strategies
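Note: these four config fields let a model opt into a pluggable strategy without further changes to its modeling code. A sketch of the config wiring for Bloom (which uses ALiBi); the pattern is an assumption for illustration, not a tested recipe from this PR:

```python
from paddlenlp.transformers import BloomConfig

config = BloomConfig()  # defaults; only the new fields from this PR are set below
config.use_long_sequence_strategies = True
config.long_sequence_strategy_type = "AttentionStrategies"
config.long_sequence_strategy_name = "AttentionWithLinearBias"
config.long_sequence_init_args = {}
# With this config, BloomModel builds its ALiBi bias through
# LongSequenceStrategies.build_long_sequence_strategy instead of build_alibi_tensor.
```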
24 changes: 21 additions & 3 deletions paddlenlp/transformers/bloom/modeling.py
@@ -26,6 +26,7 @@
from paddle.distributed import fleet
from paddle.distributed.fleet.utils import recompute

from paddlenlp.transformers.LongSequenceStrategies import LongSequenceStrategies
from paddlenlp.transformers.model_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
@@ -944,10 +945,27 @@
        attention_mask = paddle.cast(attention_mask, "bool")
        if len(attention_mask.shape) > 2:
            _attention_mask = paddle.ones([batch_size, seq_length_with_past], dtype="bool")
            alibi = build_alibi_tensor(_attention_mask, self.config.n_head, dtype=hidden_states.dtype)
            if self.config.use_long_sequence_strategies:
                alibi_layer = LongSequenceStrategies.build_long_sequence_strategy(
                    self.config.long_sequence_strategy_type,
                    self.config.long_sequence_strategy_name,
                    **self.config.long_sequence_init_args,
                )
                alibi = alibi_layer(_attention_mask, self.config.n_head, dtype=hidden_states.dtype)
                alibi = paddle.squeeze(alibi)
            else:
                alibi = build_alibi_tensor(_attention_mask, self.config.n_head, dtype=hidden_states.dtype)

        else:
            alibi = build_alibi_tensor(attention_mask, self.config.n_head, dtype=hidden_states.dtype)

            if self.config.use_long_sequence_strategies:
                alibi_layer = LongSequenceStrategies.build_long_sequence_strategy(
                    self.config.long_sequence_strategy_type,
                    self.config.long_sequence_strategy_name,
                    **self.config.long_sequence_init_args,
                )
                alibi = alibi_layer(attention_mask, self.config.n_head, dtype=hidden_states.dtype)
                alibi = paddle.squeeze(alibi)
            else:
                alibi = build_alibi_tensor(attention_mask, self.config.n_head, dtype=hidden_states.dtype)
        if self.config.tensor_parallel_degree > 1:
            block_size = self.config.n_head // self.config.tensor_parallel_degree
            alibi = alibi[
9 changes: 8 additions & 1 deletion paddlenlp/transformers/chatglm/configuration.py
@@ -21,7 +21,6 @@
    "CHATGLM_PRETRAINED_RESOURCE_FILES_MAP",
]


CHATGLM_PRETRAINED_RESOURCE_FILES_MAP = {
    "model_state": {
        "THUDM/chatglm-6b": "https://paddlenlp.bj.bcebos.com/models/community/THUDM/chatglm-6b/model_state.pdparams",
@@ -104,6 +103,10 @@ def __init__(
        activation="gelu",
        num_image_tokens=0,
        use_flash_attention=False,
        long_sequence_strategy_type=None,
        long_sequence_strategy_name=None,
        long_sequence_init_args=None,
        use_long_sequence_strategies=False,
        **kwargs
    ):
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@@ -129,3 +132,7 @@ def __init__(
        self.activation = activation
        self.num_image_tokens = num_image_tokens
        self.use_flash_attention = use_flash_attention
        self.long_sequence_strategy_type = long_sequence_strategy_type
        self.long_sequence_strategy_name = long_sequence_strategy_name
        self.long_sequence_init_args = {} if long_sequence_init_args is None else long_sequence_init_args
        self.use_long_sequence_strategies = use_long_sequence_strategies
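Note: ChatGLM uses rotary position embeddings, so an embedding-side strategy is the natural fit here. A hedged sketch of the config wiring; the `long_sequence_init_args` keys and values are assumptions for illustration, and `dim` must match the model's per-head rotary dimension:

```python
from paddlenlp.transformers import ChatGLMConfig

config = ChatGLMConfig()  # defaults; only the new fields from this PR are set below
config.use_long_sequence_strategies = True
config.long_sequence_strategy_type = "EmbeddingStrategies"
config.long_sequence_strategy_name = "DynamicNTKScalingRotaryEmbedding"
config.long_sequence_init_args = {
    "dim": 64,                        # assumed rotary dimension per head
    "max_position_embeddings": 2048,
    "base": 10000,
    "scaling_factor": 4,
}
```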