
Commit

Merge branch 'adapter-hub:main' into implement_vera
julian-fong authored Jan 2, 2025
2 parents 7f79832 + f0ca962 commit 470169f
Showing 13 changed files with 565 additions and 270 deletions.
4 changes: 4 additions & 0 deletions conftest.py
@@ -46,7 +46,11 @@ def pytest_configure(config):
config.addinivalue_line(
"markers", "is_pt_flax_cross_test: mark test to run only when PT and FLAX interactions are tested"
)
config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipelines are tested")
config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment")
config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate")
config.addinivalue_line("markers", "agent_tests: mark the agent tests that are run on their specific schedule")
config.addinivalue_line("markers", "not_device_test: mark the tests always running on cpu")


def pytest_addoption(parser):
2 changes: 1 addition & 1 deletion hf_transformers
Submodule hf_transformers updated 892 files
8 changes: 8 additions & 0 deletions pyproject.toml
@@ -1,3 +1,11 @@
[tool.black]
line-length = 119
target-version = ['py38', 'py39', 'py310']

# copied from HF for testing
[tool.pytest.ini_options]
markers = [
"flash_attn_test: marks tests related to flash attention (deselect with '-m \"not flash_attn_test\"')",
"bitsandbytes: select (or deselect with `not`) bitsandbytes integration tests",
"generate: marks tests that use the GenerationTesterMixin"
]
6 changes: 4 additions & 2 deletions setup.py
@@ -34,6 +34,7 @@
"isort>=5.5.4",
"Jinja2==2.11.3",
"nltk",
"packaging",
"parameterized",
"pillow",
"protobuf",
@@ -60,7 +61,7 @@
"timeout-decorator",
"torch",
"torchvision",
"transformers~=4.45.2",
"transformers~=4.46.3",
]


@@ -136,11 +137,12 @@ def deps_list(*pkgs):
# when modifying the following list, make sure to update src/transformers/dependency_versions_check.py
install_requires = [
deps["transformers"],
deps["packaging"],
]

setup(
name="adapters",
version="1.0.1",
version="1.1.0.dev0",
author="The AdapterHub team and community contributors",
author_email="[email protected]",
description="A Unified Library for Parameter-Efficient and Modular Transfer Learning",
2 changes: 1 addition & 1 deletion src/adapters/__init__.py
@@ -16,7 +16,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "1.0.1"
__version__ = "1.1.0.dev0"

from typing import TYPE_CHECKING

8 changes: 8 additions & 0 deletions src/adapters/head_utils.py
@@ -705,6 +705,14 @@
},
"layers": [None, "score"],
},
"MistralForQuestionAnswering": {
"config": {
"head_type": "question_answering",
"layers": 1,
"activation_function": None,
},
"layers": [None, "qa_outputs"],
},
# Electra
"ElectraForTokenClassification": {
"config": {
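The new entry registers the single-layer qa_outputs projection of MistralForQuestionAnswering as a question_answering head, so the equivalent flexible head can be created on a Mistral adapter model. A minimal sketch, assuming the usual flexible-heads API (the checkpoint and head names are only examples):

# Sketch only: checkpoint name is an example, head name is arbitrary.
from adapters import AutoAdapterModel

model = AutoAdapterModel.from_pretrained("mistralai/Mistral-7B-v0.1")
# Mirror MistralForQuestionAnswering: one linear layer, no activation.
model.add_qa_head("extractive_qa", layers=1, activation_function=None)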
6 changes: 2 additions & 4 deletions src/adapters/heads/model_mixin.py
@@ -139,10 +139,8 @@ def tie_weights(self):

super().tie_weights()

def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None):
old_embeddings = self.get_input_embeddings()
new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens, pad_to_multiple_of)
self.set_input_embeddings(new_embeddings)
def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None, mean_resizing=True):
super()._resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)

# if word embeddings are not tied, make sure that lm head is resized as well
if not self.config.tie_word_embeddings:
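The override now simply defers to the parent implementation, forwarding the mean_resizing argument introduced in recent transformers releases, and then resizes an untied LM head as before. A usage sketch (model name and added tokens are illustrative):

# Illustrative usage: resizing embeddings after extending the vocabulary.
from transformers import AutoTokenizer
from adapters import AutoAdapterModel

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoAdapterModel.from_pretrained("distilbert-base-uncased")

tokenizer.add_tokens(["<ent>", "</ent>"])
# Dispatches through the overridden _resize_token_embeddings shown above.
model.resize_token_embeddings(len(tokenizer))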
20 changes: 20 additions & 0 deletions src/adapters/loading.py
@@ -6,6 +6,7 @@
from typing import Callable, Mapping, Optional, Sequence, Tuple

import torch
from packaging.version import Version


try:
@@ -368,6 +369,23 @@ def _rename_legacy_weights(self, k):
k = k.replace(old, new)
return k

def _fix_backward_compat(self, config):
# Fix error in previous versions for LoRA/ (IA)^3
ADAPTER_PREFIX = "adapters."
MIN_VERSION = Version("1.1.0")

version = config.get("version", "")
if version.startswith(ADAPTER_PREFIX) and Version(version[len(ADAPTER_PREFIX) :]) < MIN_VERSION:
if (
config["config"].get("architecture", None) == "lora"
and config["config"]["r"] != config["config"]["alpha"]
):
logger.warning(
"Loading a LoRA trained using a faulty scaling implementation of a previous library version. Editing the configuration to make sure the adapter works as trained."
"See https://github.com/adapter-hub/adapters/pull/770 for more."
)
config["config"]["alpha"] = config["config"]["r"]

# This method is used to remove unnecessary invertible adapters from task adapters using the old format.
# In the old format, task adapters e.g. using seq_bn config specify inv. adapters but don't use them.
# As inv. adapters would be incorrectly used in the new implementation,
@@ -560,6 +578,8 @@ def load(
# The conversion to a set and then back to a list removes all duplicates
leave_out = list(set(leave_out + config["config"]["leave_out"]))
config["config"]["leave_out"] = leave_out
# Fix issues
self._fix_backward_compat(config)

adapter_name = load_as or config["name"]
# If the adapter is not part of the model, add it
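These hunks gate a config rewrite on the stored library version: LoRA adapters exported before 1.1.0 with r != alpha were trained with a faulty scaling implementation, so alpha is set equal to r at load time to keep the effective scaling consistent with training. A standalone sketch of the same check, with an illustrative config dict (the exact version string format is an assumption):

# Standalone sketch of the check above; config dict and version string are illustrative.
from packaging.version import Version

ADAPTER_PREFIX = "adapters."
MIN_VERSION = Version("1.1.0")

config = {
    "version": "adapters.1.0.1",
    "config": {"architecture": "lora", "r": 8, "alpha": 16},
}

version = config.get("version", "")
if version.startswith(ADAPTER_PREFIX) and Version(version[len(ADAPTER_PREFIX):]) < MIN_VERSION:
    if config["config"].get("architecture") == "lora" and config["config"]["r"] != config["config"]["alpha"]:
        # Mirror the fix: force alpha to r so the adapter behaves as it did during training.
        config["config"]["alpha"] = config["config"]["r"]

print(config["config"]["alpha"])  # 8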
2 changes: 2 additions & 0 deletions src/adapters/methods/lora.py
@@ -106,6 +106,7 @@ def forward(self, hidden_states: Optional[torch.Tensor], layer_input: torch.Tens
hidden_states = hidden_states * gate
else:
gate = None
hidden_states = hidden_states * self.scaling

return hidden_states, gate

@@ -179,6 +180,7 @@ def forward(self, hidden_states: Optional[torch.Tensor], layer_input: torch.Tens
hidden_states = hidden_states * gate
else:
gate = None
hidden_states = hidden_states * self.scaling

return hidden_states, gate

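Both forward paths now multiply the adapter output by self.scaling before returning it; the loader fix above compensates for checkpoints trained before this change. Assuming the common LoRA convention scaling = alpha / r, a minimal sketch of the computation:

# Minimal sketch of a scaled low-rank update, assuming scaling = alpha / r.
import torch

r, alpha = 8, 16
scaling = alpha / r

x = torch.randn(2, 5, 64)            # (batch, seq_len, hidden)
lora_a = torch.randn(64, r) * 0.02   # down-projection A
lora_b = torch.zeros(r, 64)          # up-projection B, initialized to zero

delta = (x @ lora_a) @ lora_b        # low-rank update
delta = delta * scaling              # scaled, as in the forward passes above
output = x + delta                   # added to the frozen layer's output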
187 changes: 184 additions & 3 deletions src/adapters/models/distilbert/modeling_distilbert.py
@@ -25,13 +25,26 @@
import torch
from torch import nn

from transformers.models.distilbert.modeling_distilbert import MultiHeadSelfAttention, TransformerBlock
from transformers.models.distilbert.modeling_distilbert import (
DistilBertFlashAttention2,
DistilBertSdpaAttention,
MultiHeadSelfAttention,
TransformerBlock,
)
from transformers.utils import is_flash_attn_2_available, logging

from ...composition import adjust_tensors_for_parallel, adjust_tensors_for_parallel_, match_attn_matrices_for_parallel
from ...utils import prefix_attention_mask
from .mixin_distilbert import DistilBertMultiHeadSelfAttentionMixin, DistilBertTransfomerBlockAdaptersMixin


if is_flash_attn_2_available():
from transformers.modeling_flash_attention_utils import _flash_attention_forward


logger = logging.get_logger(__name__)


class MultiHeadSelfAttentionWithAdapters(DistilBertMultiHeadSelfAttentionMixin, MultiHeadSelfAttention):
def forward(
self,
@@ -66,18 +79,20 @@ def shape(x: torch.Tensor) -> torch.Tensor:

def unshape(x: torch.Tensor) -> torch.Tensor:
"""group heads"""
return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)
return x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.n_heads * dim_per_head)

q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head)
k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head)
v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head)

# >>> START AH Changes <<<
q, k, v = match_attn_matrices_for_parallel(q, k, v)
(mask,) = adjust_tensors_for_parallel(q, mask)

k, v, mask = self.prefix_tuning(k, v, value, mask, invert_mask=False)
bs = k.size(0) # reset for Parallel block
(q,) = adjust_tensors_for_parallel(k, q)
# >>> END AH Changes <<<

mask_reshp = (bs, 1, 1, k.size(2))

@@ -105,6 +120,172 @@ def unshape(x: torch.Tensor) -> torch.Tensor:
return (context,)


class DistilBertSdpaAttentionWithAdapters(DistilBertMultiHeadSelfAttentionMixin, DistilBertSdpaAttention):
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
mask: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, ...]:
"""
Parameters:
query: torch.tensor(bs, seq_length, dim)
key: torch.tensor(bs, seq_length, dim)
value: torch.tensor(bs, seq_length, dim)
mask: torch.tensor(bs, seq_length)
Returns:
weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs,
seq_length, dim) Contextualized layer. Optional: only if `output_attentions=True`
"""
if output_attentions or head_mask is not None:
logger.warning_once(
"DistilBertSdpaAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support"
" `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but specifying"
" the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be"
' removed using the argument `attn_implementation="eager"` when loading the model.'
)
return super().forward(
query,
key,
value,
mask,
head_mask,
output_attentions,
)

batch_size, _, _ = query.size()
dim_per_head = self.dim // self.n_heads

def shape(x: torch.Tensor) -> torch.Tensor:
"""separate heads"""
# keep first dim due to parallel composition
return x.view(x.shape[0], -1, self.n_heads, dim_per_head).transpose(1, 2)

def unshape(x: torch.Tensor) -> torch.Tensor:
"""group heads"""
return x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.n_heads * dim_per_head)

q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head)
k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head)
v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head)

# >>> START AH Changes <<<
q, k, v = match_attn_matrices_for_parallel(q, k, v)
(mask,) = adjust_tensors_for_parallel(q, mask)

k, v, mask = self.prefix_tuning(k, v, value, mask, invert_mask=False)
(q,) = adjust_tensors_for_parallel(k, q)
# >>> END AH Changes <<<

# SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom
# attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0.
# Reference: https://github.com/pytorch/pytorch/issues/112577
if self.require_contiguous_qkv and q.device.type == "cuda" and mask is not None:
q = q.contiguous()
k = k.contiguous()
v = v.contiguous()

attn_output = torch.nn.functional.scaled_dot_product_attention(
q,
k,
v,
attn_mask=mask,
dropout_p=self.dropout_prob if self.training else 0.0,
is_causal=False,
)

attn_output = unshape(attn_output)
attn_output = self.out_lin(attn_output)

return (attn_output,)


class DistilBertFlashAttention2WithAdapters(DistilBertMultiHeadSelfAttentionMixin, DistilBertFlashAttention2):
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
mask: torch.Tensor,
head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, ...]:
"""
Parameters:
query: torch.tensor(bs, seq_length, dim)
key: torch.tensor(bs, seq_length, dim)
value: torch.tensor(bs, seq_length, dim)
mask: torch.tensor(bs, seq_length)
Returns:
weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs,
seq_length, dim) Contextualized layer. Optional: only if `output_attentions=True`
"""
batch_size, q_length, dim = query.size()

dim_per_head = self.dim // self.n_heads

def reshape(x: torch.Tensor) -> torch.Tensor:
"""separate heads"""
return x.view(x.shape[0], -1, self.n_heads, dim_per_head)

# Flash attention requires the input to have the shape
# batch_size x seq_length x head_dim x hidden_dim
query_states = reshape(self.q_lin(query))
key_states = reshape(self.k_lin(key))
value_states = reshape(self.v_lin(value))

attn_dropout = self.config.attention_dropout if self.training else 0.0

# In PEFT, usually we cast the layer norms in float32 for training stability reasons
# therefore the input hidden states gets silently casted in float32. Hence, we need
# cast them back in the correct dtype just to be sure everything works as expected.
# This might slowdown training & inference so it is recommended to not cast the LayerNorms
# in fp32. (LlamaRMSNorm handles it correctly)

if query_states.dtype == torch.float32:
if torch.is_autocast_enabled():
target_dtype = torch.get_autocast_gpu_dtype()
# Handle the case where the model is quantized
elif hasattr(self.config, "_pre_quantization_dtype"):
target_dtype = self.config._pre_quantization_dtype
else:
target_dtype = self.q_lin.weight.dtype

logger.warning_once(
f"The input hidden states seems to be silently casted in float32, this might be related to"
f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
f" {target_dtype}."
)

query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)

attn_weights = _flash_attention_forward(
query_states,
key_states,
value_states,
mask,
q_length,
dropout=attn_dropout,
use_top_left_mask=self._flash_attn_uses_top_left_mask,
is_causal=self.is_causal,
)

attn_weights_reshaped = attn_weights.reshape(batch_size, q_length, self.n_heads * dim_per_head)
attn_output = self.out_lin(attn_weights_reshaped)

if output_attentions:
return (attn_output, attn_weights)
else:
return (attn_output,)


class TransformerBlockWithAdapters(DistilBertTransfomerBlockAdaptersMixin, TransformerBlock):
def forward(
self,
@@ -123,7 +304,7 @@ def forward(
torch.tensor(bs, seq_length, dim) The output of the transformer block contextualization.
"""
adjust_tensors_for_parallel_(x, attn_mask)
attn_mask = prefix_attention_mask(attn_mask, dim=1, prefix_value=1) # type: ignore
attn_mask = prefix_attention_mask(attn_mask, dim=[2, 3], prefix_value=1) # type: ignore

# Self-Attention
sa_output = self.attention(
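The DistilBERT port now provides adapter-aware counterparts of all three attention implementations (eager, SDPA, and FlashAttention-2), so the attn_implementation argument remains usable with adapters enabled. A hedged usage sketch (checkpoint and adapter names are examples; SDPA requires a sufficiently recent torch):

# Illustrative: selecting the SDPA attention path for a DistilBERT adapter model.
from adapters import AutoAdapterModel

model = AutoAdapterModel.from_pretrained(
    "distilbert-base-uncased",
    attn_implementation="sdpa",  # presumably served by DistilBertSdpaAttentionWithAdapters
)
model.add_adapter("bottleneck_adapter")
model.set_active_adapters("bottleneck_adapter")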