From d3f22f68b085bad42f749315774d615f7e2dd898 Mon Sep 17 00:00:00 2001 From: yuanwu Date: Fri, 8 Nov 2024 05:18:03 +0000 Subject: [PATCH 01/10] Enable Falcon-mamba Signed-off-by: yuanwu --- optimum/habana/transformers/modeling_utils.py | 5 +++++ optimum/habana/transformers/models/__init__.py | 3 +++ tests/test_text_generation_example.py | 1 + 3 files changed, 9 insertions(+) diff --git a/optimum/habana/transformers/modeling_utils.py b/optimum/habana/transformers/modeling_utils.py index 2fd24148be..fc04e3173e 100644 --- a/optimum/habana/transformers/modeling_utils.py +++ b/optimum/habana/transformers/modeling_utils.py @@ -155,6 +155,7 @@ gaudi_esm_for_protein_folding_forward, gaudi_esmfolding_trunk_forward, gaudi_falcon_linear_forward, + gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation, gaudi_generate_speech, gaudi_get_extended_attention_mask, gaudi_gpt2_forward, @@ -593,6 +594,10 @@ def adapt_transformers_to_gaudi(): transformers.models.mamba.modeling_mamba.MambaForCausalLM._update_model_kwargs_for_generation = ( gaudi_MambaForCausalLM_update_model_kwargs_for_generation ) + transformers.models.falcon_mamba.modeling_falcon_mamba.FalconMambaForCausalLM.prepare_inputs_for_generation = ( + gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation + ) + # Optimization for Whisper on Gaudi transformers.models.whisper.modeling_whisper.WhisperSdpaAttention = GaudiWhisperSdpaAttention diff --git a/optimum/habana/transformers/models/__init__.py b/optimum/habana/transformers/models/__init__.py index 8c9a045efa..bd03afe6b5 100644 --- a/optimum/habana/transformers/models/__init__.py +++ b/optimum/habana/transformers/models/__init__.py @@ -64,6 +64,9 @@ GaudiFalconModel, gaudi_falcon_linear_forward, ) +from .falcon_mamba import ( + gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation, +) from .gemma import ( GaudiGemmaAttention, GaudiGemmaDecoderLayer, diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py index 96d6043f36..e2741f166a 100644 --- a/tests/test_text_generation_example.py +++ b/tests/test_text_generation_example.py @@ -45,6 +45,7 @@ ("Qwen/Qwen2-7B", 512, False, 9669.45787), ("Qwen/Qwen1.5-MoE-A2.7B", 1, True, 44.25834541569395), ("EleutherAI/gpt-neo-2.7B", 1, False, 257.2476416844122), + ("tiiuae/falcon-mamba-7b", 1, False, 47.1464839567739), ], "fp8": [ ("tiiuae/falcon-180B", 4, 950, True, 128, 128, 2506.68), From db096b600e1d3266487639b4d7290161eff623ca Mon Sep 17 00:00:00 2001 From: yuanwu Date: Wed, 13 Nov 2024 06:06:02 +0000 Subject: [PATCH 02/10] Add the model files Signed-off-by: yuanwu --- .../models/falcon_mamba/__init__.py | 17 ++++ .../falcon_mamba/modeling_falcon_mamba.py | 78 +++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 optimum/habana/transformers/models/falcon_mamba/__init__.py create mode 100644 optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py diff --git a/optimum/habana/transformers/models/falcon_mamba/__init__.py b/optimum/habana/transformers/models/falcon_mamba/__init__.py new file mode 100644 index 0000000000..fa95da28da --- /dev/null +++ b/optimum/habana/transformers/models/falcon_mamba/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .modeling_falcon_mamba import ( + gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation +) \ No newline at end of file diff --git a/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py b/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py new file mode 100644 index 0000000000..fee5c369ad --- /dev/null +++ b/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py @@ -0,0 +1,78 @@ +# coding=utf-8 +# Copyright 2024 Tri Dao, Albert Gu, Technological Innovation Institute and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch FALCONMAMBA model.""" +from typing import Optional + +import torch +from transformers.utils import ( + logging, +) + +from transformers.cache_utils import MambaCache + +logger = logging.get_logger(__name__) + +""" +Copys from https://github.com/huggingface/transformers/blob/53fad641cfdb5105e2470bcf3ef17ea8e25cc300/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py#L762 +The only differences are: +- Use the torch.index_select function to replace the slicing operation of Line 51 +""" +def gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation( + self, + input_ids, + inputs_embeds=None, + use_cache=None, + cache_params: Optional[MambaCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + **kwargs, +): + if use_cache: + # `cache_position` should have been initialized in `generate` + if cache_position is None: + raise ValueError( + "`cache_position` should not be None as it should have been initialized in " + "`model.generate`, you are responsible for passing in a valid `cache_position` if " + "you are calling `prepare_inputs_for_generation` directly with `use_cache=True`" + ) + if cache_position[0] > 0: + #input_ids = input_ids[:, -1].unsqueeze(-1) + idx = torch.tensor([input_ids.size(1) - 1]) + input_ids = torch.index_select(input_ids, 1, idx) + + if attention_mask is not None: + attention_mask = None + + else: + # we initialize the `cache_position` to full size of `conv_states` at prefill stage + # considering padding will be applied when input length is shorter, and truncation + # will be applied when it is longer, so it will be equivalent to always have it match + # the length of `cache_params.conv_states`, which is `config.conv_kernel` + cache_position = torch.arange(0, self.config.conv_kernel, device=input_ids.device) + + if inputs_embeds is not None and cache_params is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids.contiguous()} + + model_inputs.update( + { + "cache_params": cache_params, + "use_cache": use_cache, + "cache_position": cache_position, + "attention_mask": attention_mask, + } + ) + return model_inputs \ No newline at end of file From bdc99bc42da5bf5026b4117421988cf880ff12ab Mon Sep 17 00:00:00 2001 From: yuanwu Date: Wed, 13 Nov 2024 06:49:37 +0000 Subject: [PATCH 03/10] Fix errors of make style Signed-off-by: yuanwu --- optimum/habana/transformers/modeling_utils.py | 1 - .../habana/transformers/models/falcon_mamba/__init__.py | 4 +--- .../models/falcon_mamba/modeling_falcon_mamba.py | 9 ++++++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/optimum/habana/transformers/modeling_utils.py b/optimum/habana/transformers/modeling_utils.py index fc04e3173e..c2be5069b4 100644 --- a/optimum/habana/transformers/modeling_utils.py +++ b/optimum/habana/transformers/modeling_utils.py @@ -598,7 +598,6 @@ def adapt_transformers_to_gaudi(): gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation ) - # Optimization for Whisper on Gaudi transformers.models.whisper.modeling_whisper.WhisperSdpaAttention = GaudiWhisperSdpaAttention transformers.models.whisper.modeling_whisper.WhisperDecoderLayer = GaudiWhisperDecoderLayer diff --git a/optimum/habana/transformers/models/falcon_mamba/__init__.py b/optimum/habana/transformers/models/falcon_mamba/__init__.py index fa95da28da..8d99cf5162 100644 --- a/optimum/habana/transformers/models/falcon_mamba/__init__.py +++ b/optimum/habana/transformers/models/falcon_mamba/__init__.py @@ -12,6 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .modeling_falcon_mamba import ( - gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation -) \ No newline at end of file +from .modeling_falcon_mamba import gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation diff --git a/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py b/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py index fee5c369ad..8268504171 100644 --- a/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py +++ b/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py @@ -13,14 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. """PyTorch FALCONMAMBA model.""" + from typing import Optional import torch +from transformers.cache_utils import MambaCache from transformers.utils import ( logging, ) -from transformers.cache_utils import MambaCache logger = logging.get_logger(__name__) @@ -29,6 +30,8 @@ The only differences are: - Use the torch.index_select function to replace the slicing operation of Line 51 """ + + def gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation( self, input_ids, @@ -48,7 +51,7 @@ def gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation( "you are calling `prepare_inputs_for_generation` directly with `use_cache=True`" ) if cache_position[0] > 0: - #input_ids = input_ids[:, -1].unsqueeze(-1) + # input_ids = input_ids[:, -1].unsqueeze(-1) idx = torch.tensor([input_ids.size(1) - 1]) input_ids = torch.index_select(input_ids, 1, idx) @@ -75,4 +78,4 @@ def gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation( "attention_mask": attention_mask, } ) - return model_inputs \ No newline at end of file + return model_inputs From 70c22d98309e02cdcaff52f3807dcaeaa3c060c4 Mon Sep 17 00:00:00 2001 From: yuanwu Date: Sun, 17 Nov 2024 21:45:55 +0000 Subject: [PATCH 04/10] Reduce the graph compiling time. Signed-off-by: yuanwu --- examples/text-generation/run_generation.py | 2 - .../habana/transformers/generation/utils.py | 1 - optimum/habana/transformers/modeling_utils.py | 10 ++ .../habana/transformers/models/__init__.py | 1 + .../models/falcon_mamba/__init__.py | 1 + .../falcon_mamba/modeling_falcon_mamba.py | 116 +++++++++++++++++- 6 files changed, 122 insertions(+), 9 deletions(-) diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py index 74e4f600d7..341caccd2a 100755 --- a/examples/text-generation/run_generation.py +++ b/examples/text-generation/run_generation.py @@ -503,8 +503,6 @@ def compute_valid_sequence_lengths_tensor(input_tokens): iteration_times=iteration_times, profiling_record_shapes=args.profiling_record_shapes, ).cpu() - first_token_time = iteration_times[0] + encode_duration - logger.info(f"Time to first token = {first_token_time*1000}ms") return tokenizer.batch_decode(outputs, skip_special_tokens=True) from optimum.habana.utils import HabanaProfile diff --git a/optimum/habana/transformers/generation/utils.py b/optimum/habana/transformers/generation/utils.py index f088974f3f..e953a93814 100644 --- a/optimum/habana/transformers/generation/utils.py +++ b/optimum/habana/transformers/generation/utils.py @@ -2453,7 +2453,6 @@ def _sample( else: # case3 (default case): token_idx is None next_token_scores = logits_processor(input_ids, next_token_logits) - # Store scores, attentions and hidden_states when required if return_dict_in_generate: if output_scores: diff --git a/optimum/habana/transformers/modeling_utils.py b/optimum/habana/transformers/modeling_utils.py index 0c68696b80..e5493acf1b 100644 --- a/optimum/habana/transformers/modeling_utils.py +++ b/optimum/habana/transformers/modeling_utils.py @@ -173,6 +173,7 @@ gaudi_esmfolding_trunk_forward, gaudi_falcon_linear_forward, gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation, + gaudi_FalconMambaModel_forward, gaudi_generate_speech, gaudi_get_extended_attention_mask, gaudi_gpt2_forward, @@ -625,6 +626,15 @@ def adapt_transformers_to_gaudi(): transformers.models.falcon_mamba.modeling_falcon_mamba.FalconMambaForCausalLM.prepare_inputs_for_generation = ( gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation ) + transformers.models.falcon_mamba.modeling_falcon_mamba.FalconMambaModel.forward = ( + gaudi_FalconMambaModel_forward + ) + transformers.models.falcon_mamba.modeling_falcon_mamba.FalconMambaRMSNorm.forward = ( + gaudi_llama_rmsnorm_forward + ) + + + # Optimization for Whisper on Gaudi transformers.models.whisper.modeling_whisper.WhisperSdpaAttention = GaudiWhisperSdpaAttention diff --git a/optimum/habana/transformers/models/__init__.py b/optimum/habana/transformers/models/__init__.py index e35e7c112e..eaf80b199c 100644 --- a/optimum/habana/transformers/models/__init__.py +++ b/optimum/habana/transformers/models/__init__.py @@ -72,6 +72,7 @@ ) from .falcon_mamba import ( gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation, + gaudi_FalconMambaModel_forward, ) from .gemma import ( GaudiGemmaAttention, diff --git a/optimum/habana/transformers/models/falcon_mamba/__init__.py b/optimum/habana/transformers/models/falcon_mamba/__init__.py index 8d99cf5162..05dd539c9f 100644 --- a/optimum/habana/transformers/models/falcon_mamba/__init__.py +++ b/optimum/habana/transformers/models/falcon_mamba/__init__.py @@ -13,3 +13,4 @@ # limitations under the License. from .modeling_falcon_mamba import gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation +from .modeling_falcon_mamba import gaudi_FalconMambaModel_forward diff --git a/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py b/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py index 8268504171..609d19583b 100644 --- a/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py +++ b/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py @@ -13,25 +13,129 @@ # See the License for the specific language governing permissions and # limitations under the License. """PyTorch FALCONMAMBA model.""" - -from typing import Optional - +import os import torch +from typing import Any, Dict, Optional, Tuple, Union from transformers.cache_utils import MambaCache from transformers.utils import ( logging, ) +from transformers.models.falcon_mamba.modeling_falcon_mamba import FalconMambaOutput +from transformers.utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available, is_mambapy_available +import habana_frameworks.torch.core as htcore + + +# if is_mambapy_available(): +# from mambapy.pscan import pscan +# else: +# pscan = None +# if is_mamba_ssm_available(): +# from mamba_ssm.ops.selective_scan_interface import selective_scan_fn +# from mamba_ssm.ops.triton.selective_state_update import selective_state_update +# from ...kernels.falcon_mamba import mamba_inner_fn +# else: +# selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None + +# if is_causal_conv1d_available(): +# from causal_conv1d import causal_conv1d_fn, causal_conv1d_update +# else: +# causal_conv1d_update, causal_conv1d_fn = None, None + +# is_fast_path_available = all( +# (selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn) +# ) logger = logging.get_logger(__name__) """ -Copys from https://github.com/huggingface/transformers/blob/53fad641cfdb5105e2470bcf3ef17ea8e25cc300/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py#L762 +Copys from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py#L635 The only differences are: -- Use the torch.index_select function to replace the slicing operation of Line 51 +- Use the mark_step function to reduce the graph compiling time. """ +def gaudi_FalconMambaModel_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.LongTensor] = None, + cache_params: Optional[MambaCache] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + lazy_mode: Optional[bool] = True, +) -> Union[Tuple, FalconMambaOutput]: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if (input_ids is None) ^ (inputs_embeds is not None): # ^ is python for xor + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids) + + if self.gradient_checkpointing and self.training and use_cache: + use_cache = False + + if use_cache: + if cache_params is None: + cache_params = MambaCache( + self.config, inputs_embeds.size(0), device=inputs_embeds.device, dtype=inputs_embeds.dtype + ) + cache_position = torch.arange(0, self.config.conv_kernel, device=inputs_embeds.device) + elif cache_position is None: + # cases when we do manual forward instead of using `model.generate` which will initiate + # `cache_position` and makes sure it is not None, throw error here instead of doing some + # hack to conjecture the current cache position + raise ValueError( + "You have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, " + "you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will " + "be initialized for you automatically" + ) + else: + cache_params = None + hidden_states = inputs_embeds + all_hidden_states = () if output_hidden_states else None + for mixer_block in self.layers: + if lazy_mode: + htcore.mark_step() + if self.gradient_checkpointing and self.training: + hidden_states = self._gradient_checkpointing_func( + mixer_block.__call__, hidden_states, cache_params, cache_position, attention_mask + ) + else: + hidden_states = mixer_block( + hidden_states, + cache_params=cache_params, + cache_position=cache_position, + attention_mask=attention_mask, + ) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + hidden_states = self.norm_f(hidden_states) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, cache_params, all_hidden_states] if v is not None) + + return FalconMambaOutput( + last_hidden_state=hidden_states, + cache_params=cache_params if use_cache else None, + hidden_states=all_hidden_states, + ) + +""" +Copys from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py#L762 +The only differences are: +- Use the torch.index_select function to replace the slicing operation of Line 51 +""" def gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation( self, input_ids, @@ -52,7 +156,7 @@ def gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation( ) if cache_position[0] > 0: # input_ids = input_ids[:, -1].unsqueeze(-1) - idx = torch.tensor([input_ids.size(1) - 1]) + idx = torch.tensor([input_ids.size(1) - 1], device=input_ids.device) input_ids = torch.index_select(input_ids, 1, idx) if attention_mask is not None: From 942ba3249f4b48724ce1ac0c3f2a84c3f29236ef Mon Sep 17 00:00:00 2001 From: yuanwu Date: Mon, 18 Nov 2024 01:35:34 +0000 Subject: [PATCH 05/10] Fix make style Signed-off-by: yuanwu --- examples/text-generation/run_generation.py | 2 ++ optimum/habana/transformers/modeling_utils.py | 11 ++--------- .../transformers/models/falcon_mamba/__init__.py | 6 ++++-- .../models/falcon_mamba/modeling_falcon_mamba.py | 15 ++++++++++----- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py index 341caccd2a..74e4f600d7 100755 --- a/examples/text-generation/run_generation.py +++ b/examples/text-generation/run_generation.py @@ -503,6 +503,8 @@ def compute_valid_sequence_lengths_tensor(input_tokens): iteration_times=iteration_times, profiling_record_shapes=args.profiling_record_shapes, ).cpu() + first_token_time = iteration_times[0] + encode_duration + logger.info(f"Time to first token = {first_token_time*1000}ms") return tokenizer.batch_decode(outputs, skip_special_tokens=True) from optimum.habana.utils import HabanaProfile diff --git a/optimum/habana/transformers/modeling_utils.py b/optimum/habana/transformers/modeling_utils.py index e5493acf1b..76e46eddb1 100644 --- a/optimum/habana/transformers/modeling_utils.py +++ b/optimum/habana/transformers/modeling_utils.py @@ -626,15 +626,8 @@ def adapt_transformers_to_gaudi(): transformers.models.falcon_mamba.modeling_falcon_mamba.FalconMambaForCausalLM.prepare_inputs_for_generation = ( gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation ) - transformers.models.falcon_mamba.modeling_falcon_mamba.FalconMambaModel.forward = ( - gaudi_FalconMambaModel_forward - ) - transformers.models.falcon_mamba.modeling_falcon_mamba.FalconMambaRMSNorm.forward = ( - gaudi_llama_rmsnorm_forward - ) - - - + transformers.models.falcon_mamba.modeling_falcon_mamba.FalconMambaModel.forward = gaudi_FalconMambaModel_forward + transformers.models.falcon_mamba.modeling_falcon_mamba.FalconMambaRMSNorm.forward = gaudi_llama_rmsnorm_forward # Optimization for Whisper on Gaudi transformers.models.whisper.modeling_whisper.WhisperSdpaAttention = GaudiWhisperSdpaAttention diff --git a/optimum/habana/transformers/models/falcon_mamba/__init__.py b/optimum/habana/transformers/models/falcon_mamba/__init__.py index 05dd539c9f..efbcaaa072 100644 --- a/optimum/habana/transformers/models/falcon_mamba/__init__.py +++ b/optimum/habana/transformers/models/falcon_mamba/__init__.py @@ -12,5 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .modeling_falcon_mamba import gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation -from .modeling_falcon_mamba import gaudi_FalconMambaModel_forward +from .modeling_falcon_mamba import ( + gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation, + gaudi_FalconMambaModel_forward, +) diff --git a/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py b/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py index 609d19583b..9a45219bfd 100644 --- a/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py +++ b/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py @@ -13,16 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. """PyTorch FALCONMAMBA model.""" -import os + +from typing import Optional, Tuple, Union + +import habana_frameworks.torch.core as htcore import torch -from typing import Any, Dict, Optional, Tuple, Union from transformers.cache_utils import MambaCache +from transformers.models.falcon_mamba.modeling_falcon_mamba import FalconMambaOutput from transformers.utils import ( logging, ) -from transformers.models.falcon_mamba.modeling_falcon_mamba import FalconMambaOutput -from transformers.utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available, is_mambapy_available -import habana_frameworks.torch.core as htcore # if is_mambapy_available(): @@ -53,6 +53,8 @@ The only differences are: - Use the mark_step function to reduce the graph compiling time. """ + + def gaudi_FalconMambaModel_forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -131,11 +133,14 @@ def gaudi_FalconMambaModel_forward( hidden_states=all_hidden_states, ) + """ Copys from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/falcon_mamba/modeling_falcon_mamba.py#L762 The only differences are: - Use the torch.index_select function to replace the slicing operation of Line 51 """ + + def gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation( self, input_ids, From beff5fcc893ed9355ea489cc75f3c5f6bcc531e1 Mon Sep 17 00:00:00 2001 From: yuanwu Date: Tue, 19 Nov 2024 02:13:49 +0000 Subject: [PATCH 06/10] Remove useless code Signed-off-by: yuanwu --- .../falcon_mamba/modeling_falcon_mamba.py | 22 ------------------- 1 file changed, 22 deletions(-) diff --git a/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py b/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py index 9a45219bfd..77042a134f 100644 --- a/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py +++ b/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py @@ -24,28 +24,6 @@ logging, ) - -# if is_mambapy_available(): -# from mambapy.pscan import pscan -# else: -# pscan = None - -# if is_mamba_ssm_available(): -# from mamba_ssm.ops.selective_scan_interface import selective_scan_fn -# from mamba_ssm.ops.triton.selective_state_update import selective_state_update - -# from ...kernels.falcon_mamba import mamba_inner_fn -# else: -# selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None - -# if is_causal_conv1d_available(): -# from causal_conv1d import causal_conv1d_fn, causal_conv1d_update -# else: -# causal_conv1d_update, causal_conv1d_fn = None, None - -# is_fast_path_available = all( -# (selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn) -# ) logger = logging.get_logger(__name__) """ From 25c7e1348cdf04f10ea8343a64ea9f7afbe1b106 Mon Sep 17 00:00:00 2001 From: yuanwu Date: Tue, 19 Nov 2024 02:14:45 +0000 Subject: [PATCH 07/10] Fix error of make style Signed-off-by: yuanwu --- .../transformers/models/falcon_mamba/modeling_falcon_mamba.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py b/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py index 77042a134f..5d618fac91 100644 --- a/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py +++ b/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py @@ -24,6 +24,7 @@ logging, ) + logger = logging.get_logger(__name__) """ From c800aaab0c33dd731bfd3a6ed5e3b821a6017135 Mon Sep 17 00:00:00 2001 From: yuanwu Date: Tue, 26 Nov 2024 07:06:49 +0000 Subject: [PATCH 08/10] Remove the useless modification Signed-off-by: yuanwu --- optimum/habana/transformers/generation/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/habana/transformers/generation/utils.py b/optimum/habana/transformers/generation/utils.py index 00e64d7ce0..012fcccfd1 100644 --- a/optimum/habana/transformers/generation/utils.py +++ b/optimum/habana/transformers/generation/utils.py @@ -2462,6 +2462,7 @@ def _sample( else: # case3 (default case): token_idx is None next_token_scores = logits_processor(input_ids, next_token_logits) + # Store scores, attentions and hidden_states when required if return_dict_in_generate: if output_scores: From 351d2646564131d4121da247edddb8338874ec97 Mon Sep 17 00:00:00 2001 From: yuanwu Date: Tue, 26 Nov 2024 07:08:47 +0000 Subject: [PATCH 09/10] Remove empty lines Signed-off-by: yuanwu --- .../models/falcon_mamba/modeling_falcon_mamba.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py b/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py index 5d618fac91..0e0e6a6316 100644 --- a/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py +++ b/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py @@ -32,8 +32,6 @@ The only differences are: - Use the mark_step function to reduce the graph compiling time. """ - - def gaudi_FalconMambaModel_forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -118,8 +116,6 @@ def gaudi_FalconMambaModel_forward( The only differences are: - Use the torch.index_select function to replace the slicing operation of Line 51 """ - - def gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation( self, input_ids, @@ -166,4 +162,4 @@ def gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation( "attention_mask": attention_mask, } ) - return model_inputs + return model_inputs \ No newline at end of file From d42bbec0a4e3df465449721f48bc0b3f67f86465 Mon Sep 17 00:00:00 2001 From: yuanwu Date: Tue, 26 Nov 2024 07:10:05 +0000 Subject: [PATCH 10/10] Fixed the errors of make style Signed-off-by: yuanwu --- .../models/falcon_mamba/modeling_falcon_mamba.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py b/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py index 0e0e6a6316..5d618fac91 100644 --- a/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py +++ b/optimum/habana/transformers/models/falcon_mamba/modeling_falcon_mamba.py @@ -32,6 +32,8 @@ The only differences are: - Use the mark_step function to reduce the graph compiling time. """ + + def gaudi_FalconMambaModel_forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -116,6 +118,8 @@ def gaudi_FalconMambaModel_forward( The only differences are: - Use the torch.index_select function to replace the slicing operation of Line 51 """ + + def gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation( self, input_ids, @@ -162,4 +166,4 @@ def gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation( "attention_mask": attention_mask, } ) - return model_inputs \ No newline at end of file + return model_inputs