Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
e6980cb
Update `setup.py` and examples to Transformers v4.51
regisss Apr 18, 2025
9786545
Upgrade `generation/utils.py` to v4.51.3 (except `_beam_search`, will…
regisss Apr 18, 2025
0cd573e
Upgrade to Transformers v4.51
regisss Apr 21, 2025
34f32a9
Fixes
regisss Apr 21, 2025
94be78d
Merge branch 'main' into transformers_future
regisss Apr 21, 2025
25b053c
Fix AWQ requirements
regisss Apr 22, 2025
ef27edc
Merge branch 'main' into transformers_future
regisss Apr 30, 2025
071fb63
Beam search
regisss Apr 30, 2025
b4e3ed3
Make style
regisss Apr 30, 2025
831bfae
Text-generation fix
regisss Apr 30, 2025
0947f5b
1x tests fix
regisss May 5, 2025
85ec584
Merge branch 'main' into transformers_future
regisss May 8, 2025
8b2e896
Fix 8x tests
regisss May 8, 2025
7ef7282
Merge branch 'main' into transformers_future
regisss May 20, 2025
452b66a
Merge branch 'main' into transformers_future
regisss May 22, 2025
762c08f
Fix 8x tests
regisss May 22, 2025
1c91d3f
Fix mllama test
regisss May 29, 2025
4636aef
Merge branch 'main' into transformers_future
IlyasMoutawwakil Jun 11, 2025
31bf07e
Fix CI tests
regisss May 30, 2025
3805cea
Merge branch 'main' into transformers_future
regisss Jun 13, 2025
01dc42d
Merge branch 'main' into transformers_future
regisss Jun 23, 2025
c0893c2
Merge branch 'main' into transformers_future
regisss Jun 26, 2025
f0a5945
Merge branch 'main' into transformers_future
regisss Aug 11, 2025
3515a12
Update examples
regisss Aug 11, 2025
d4960e7
generation + integrations + loss folders
regisss Aug 11, 2025
91e29c4
models
regisss Aug 13, 2025
5b39327
trainer + tests
regisss Aug 13, 2025
5450d7a
Fix tests + comment remote models (to fix later)
regisss Aug 18, 2025
f8aa93a
Fix decilm
regisss Aug 18, 2025
f1de537
Fix deepseek_v2
regisss Aug 18, 2025
4d0d67c
finish model updates
regisss Aug 19, 2025
41fc61c
Upgrade to v4.55.2
regisss Aug 19, 2025
1316366
Merge branch 'main' into transformers_future
regisss Aug 21, 2025
e0c2e5b
Fix baselines
regisss Aug 28, 2025
749085e
Merge branch 'main' into transformers_future
regisss Sep 16, 2025
336a049
Fix baselines 2
regisss Sep 8, 2025
bbd6c5f
Merge branch 'main' into transformers_future
regisss Sep 25, 2025
5d1243c
Revert matmul change for OPT
regisss Sep 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from transformers.utils import logging

from ...generation.utils import GaudiGenerationMixin
from ...modeling_attn_mask_utils import _gaudi_prepare_4d_causal_attention_mask
from .configuration_baichuan import BaichuanConfig
from .generation_utils import TextIterStreamer, build_chat_input
Expand Down Expand Up @@ -1163,7 +1164,7 @@ def no_init_weights(_enable=True):
_init_weights = old_init_weights


class BaichuanForCausalLM(BaichuanPreTrainedModel):
class BaichuanForCausalLM(BaichuanPreTrainedModel, GaudiGenerationMixin):
def __init__(self, config, *model_args, **model_kwargs):
super().__init__(config, *model_args, **model_kwargs)
self.model = BaichuanModel(config)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from transformers.utils import logging

from ....utils import warn0
from ...generation.utils import GaudiGenerationMixin
from ...modeling_attn_mask_utils import _gaudi_prepare_4d_causal_attention_mask
from .configuration_chatglm import ChatGLMConfig

Expand Down Expand Up @@ -1347,7 +1348,7 @@ def forward(
)


class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel, GaudiGenerationMixin):
def __init__(self, config: ChatGLMConfig, empty_init=False, device=None):
super().__init__(config)

Expand Down
6 changes: 4 additions & 2 deletions optimum/habana/transformers/models/gemma/modeling_gemma.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,7 +661,7 @@ def forward(
):
htcore.mark_step()

hidden_states = decoder_layer(
layer_outputs = decoder_layer(
hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
Expand All @@ -678,8 +678,10 @@ def forward(
**kwargs,
)

hidden_states = layer_outputs[0]

if use_cache:
next_decoder_cache += (hidden_states[1],)
next_decoder_cache += (layer_outputs[1],)

hidden_states = self.norm(hidden_states)

Expand Down
17 changes: 9 additions & 8 deletions optimum/habana/transformers/models/minicpm/modeling_minicpm.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
from transformers.utils.import_utils import is_torch_fx_available

from ....utils import warn0
from ...generation.utils import GaudiGenerationMixin
from .configuration_minicpm import MiniCPM3Config


Expand Down Expand Up @@ -505,8 +506,7 @@ def forward(
value_states = past_value_states.index_add(
-2, token_idx - 1, value_states - torch.index_select(past_value_states, -2, token_idx - 1)
)
past_key_value.key_cache[self.layer_idx] = key_states
past_key_value.value_cache[self.layer_idx] = value_states
past_key_value.update(key_states, value_states, self.layer_idx)

attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.softmax_scale

Expand Down Expand Up @@ -644,8 +644,7 @@ def forward(
value_states = past_value_states.index_add(
-2, token_idx - 1, value_states - torch.index_select(past_value_states, -2, token_idx - 1)
)
past_key_value.key_cache[self.layer_idx] = key_states
past_key_value.value_cache[self.layer_idx] = value_states
past_key_value.update(key_states, value_states, self.layer_idx)

# TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
# to be able to avoid many of these transpose/reshape/view.
Expand Down Expand Up @@ -854,7 +853,7 @@ def forward(
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
"with a layer index."
)
usable_length = past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
usable_length = past_key_value.get_seq_length(self.layer_idx)
if token_idx is None:
kv_seq_len += usable_length
elif usable_length > 0:
Expand Down Expand Up @@ -1023,7 +1022,9 @@ def forward(
"The bare MiniCPM Model outputting raw hidden-states without any specific head on top.",
MINICPM_START_DOCSTRING,
)
class MiniCPM3PreTrainedModel(PreTrainedModel):
class MiniCPM3PreTrainedModel(
PreTrainedModel,
):
config_class = MiniCPM3Config
base_model_prefix = "model"
supports_gradient_checkpointing = True
Expand Down Expand Up @@ -1200,7 +1201,7 @@ def forward(
use_legacy_cache = not isinstance(past_key_values, Cache)
if use_legacy_cache:
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
past_key_values_length = past_key_values.get_usable_length(seq_length)
past_key_values_length = past_key_values.get_seq_length()

if position_ids is None:
device = input_ids.device if input_ids is not None else inputs_embeds.device
Expand Down Expand Up @@ -1292,7 +1293,7 @@ def forward(
)


class MiniCPM3ForCausalLM(MiniCPM3PreTrainedModel):
class MiniCPM3ForCausalLM(MiniCPM3PreTrainedModel, GaudiGenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

def __init__(self, config):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ def forward(
hidden_states = self.input_layernorm(hidden_states)

# Self Attention
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states, _, present_key_value = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,11 @@ def forward(
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
"with a layer index."
)
if token_idx is not None and past_key_value.get_usable_length(kv_seq_len, self.layer_idx) > 0:
if token_idx is not None and past_key_value.get_seq_length(self.layer_idx) > 0:
# When token_idx is used, static seq len = (input token len + max output token len)
kv_seq_len = past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
kv_seq_len = past_key_value.get_seq_length(self.layer_idx)
else:
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
kv_seq_len += past_key_value.get_seq_length(self.layer_idx)
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)

# Partial rotary embedding
Expand All @@ -98,14 +98,16 @@ def forward(

if past_key_value is not None:
if token_idx is not None:
if 0 <= self.layer_idx < len(past_key_value.key_cache):
past_key_value.key_cache[self.layer_idx].index_copy_(2, token_idx - 1, key_states)
past_key_value.value_cache[self.layer_idx].index_copy_(2, token_idx - 1, value_states)
key_states = past_key_value.key_cache[self.layer_idx]
value_states = past_key_value.value_cache[self.layer_idx]
if (
0 <= self.layer_idx < len(past_key_value)
and past_key_value.layers[self.layer_idx].keys is not None
):
past_key_value.layers[self.layer_idx].keys.index_copy_(2, token_idx - 1, key_states)
past_key_value.layers[self.layer_idx].values.index_copy_(2, token_idx - 1, value_states)
key_states = past_key_value.layers[self.layer_idx].keys
value_states = past_key_value.layers[self.layer_idx].values
else:
past_key_value.key_cache.append(key_states)
past_key_value.value_cache.append(value_states)
past_key_value.update(key_states, value_states, self.layer_idx)
else:
# Specific to RoPE models with partial rotation
cache_kwargs = {
Expand Down
22 changes: 12 additions & 10 deletions optimum/habana/transformers/models/stablelm/modeling_stablelm.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,11 +71,11 @@ def forward(
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
"with a layer index."
)
if token_idx is not None and past_key_value.get_usable_length(kv_seq_len, self.layer_idx) > 0:
if token_idx is not None and past_key_value.get_seq_length(self.layer_idx) > 0:
# When token_idx is used, static seq len = (input token len + max output token len)
kv_seq_len = past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
kv_seq_len = past_key_value.get_seq_length(self.layer_idx)
else:
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
kv_seq_len += past_key_value.get_seq_length(self.layer_idx)
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)

# Partial rotary embedding
Expand All @@ -96,14 +96,16 @@ def forward(

if past_key_value is not None:
if token_idx is not None:
if 0 <= self.layer_idx < len(past_key_value.key_cache):
past_key_value.key_cache[self.layer_idx].index_copy_(2, token_idx - 1, key_states)
past_key_value.value_cache[self.layer_idx].index_copy_(2, token_idx - 1, value_states)
key_states = past_key_value.key_cache[self.layer_idx]
value_states = past_key_value.value_cache[self.layer_idx]
if (
0 <= self.layer_idx < len(past_key_value)
and past_key_value.layers[self.layer_idx].keys is not None
):
past_key_value.layers[self.layer_idx].keys.index_copy_(2, token_idx - 1, key_states)
past_key_value.layers[self.layer_idx].values.index_copy_(2, token_idx - 1, value_states)
key_states = past_key_value.layers[self.layer_idx].keys
value_states = past_key_value.layers[self.layer_idx].values
else:
past_key_value.key_cache.append(key_states)
past_key_value.value_cache.append(value_states)
past_key_value.update(key_states, value_states, self.layer_idx)
else:
# Specific to RoPE models with partial rotation
cache_kwargs = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@
"throughput": 109.70751574382221
},
"gaudi3": {
"output": "DeepSpeed is a machine learning framework that enables training of large-scale models on commodity hardware. It is designed to be a drop-in replacement for PyTorch, and it is compatible with the existing PyTorch ecosystem. DeepSpeed is designed to be easy to use, and it provides a number of features that make it easy to train large-scale models. DeepSpeed is designed to be scalable, and it can be used to train models on a single machine or on a cluster of machines. DeepSpeed is designed to be efficient,",
"output": "DeepSpeed is a machine learning framework that enables training of large-scale models on commodity hardware. It is designed to be a drop-in replacement for PyTorch and is compatible with existing PyTorch code. DeepSpeed is open source and available on GitHub.\n\nDeepSpeed is a machine learning framework that enables training of large-scale models on commodity hardware. It is designed to be a drop-in replacement for PyTorch and is compatible with existing PyTorch code. DeepSpeed is open source and available on GitHub.\n\n<h2>What is",
"throughput": 135.97272017864475
}
},
Expand Down Expand Up @@ -415,7 +415,7 @@
"throughput": 134.94827207337997
},
"gaudi3": {
"output": "DeepSpeed is a machine learning framework that accelerates training of large models on a single machine or distributed systems. It is designed to be compatible with PyTorch and TensorFlow, and can be used to train models on a single machine or on a distributed system.\n\nDeepSpeed is a machine learning framework that accelerates training of large models on a single machine or distributed systems. It is designed to be compatible with PyTorch and TensorFlow, and can be used to train models on a single machine or on a distributed system",
"output": "DeepSpeed is a machine learning framework that accelerates training and inference of deep learning models. It is designed to be flexible and easy to use, with a focus on performance and scalability. DeepSpeed is built on top of PyTorch, and it provides a set of tools and libraries that can be used to optimize the training and inference of deep learning models.\n\nDeepSpeed is designed to be used with a variety of hardware platforms, including GPUs, TPUs, and CPUs. It provides a",
"throughput": 160.48685620965531
}
},
Expand All @@ -425,7 +425,7 @@
"throughput": 71.29570003665306
},
"gaudi3": {
"output": "DeepSpeed is a machine learning framework that enables training of large models on a single machine with multiple GPUs. It is designed to be easy to use and efficient, and it supports a wide range of models and tasks.\n\nDeepSpeed is a deep learning framework that enables training of large models on a single machine with multiple GPUs. It is designed to be easy to use and efficient, and it supports a wide range of models and tasks.\n\nDeepSpeed is a deep learning framework that enables training of large models on a",
"output": "DeepSpeed is a machine learning framework that enables training of large models on a single machine with a single GPU. It is designed to be easy to use and efficient, and it can be used to train models on a variety of tasks.\n\nThe latest DeepSpeed for PC has come up with a few updates that are better than the previous version. Want to know those? Here are they:\n\n## DeepSpeed Andorid App Summary\n\nDeepSpeed has developed the DeepSpeed for Android. You can find it under the",
"throughput": 81.6817273229847
}
},
Expand Down
Loading