diff --git a/Makefile b/Makefile index 1a29eece9e..2f0bb7ea2f 100644 --- a/Makefile +++ b/Makefile @@ -103,6 +103,7 @@ doc: build_doc_docker_image clean: find . -name "habana_log.livealloc.log_*" -type f -delete + find . -name "hl-smi_log*" -type f -delete find . -name .lock -type f -delete find . -name .graph_dumps -type d -exec rm -r {} + find . -name save-hpu.pdb -type f -delete diff --git a/examples/audio-classification/run_audio_classification.py b/examples/audio-classification/run_audio_classification.py index 5571fad7ce..27e18cbed7 100644 --- a/examples/audio-classification/run_audio_classification.py +++ b/examples/audio-classification/run_audio_classification.py @@ -47,7 +47,7 @@ def check_optimum_habana_min_version(*a, **b): logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -check_min_version("4.37.0") +check_min_version("4.38.0") check_optimum_habana_min_version("1.10.0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") diff --git a/examples/contrastive-image-text/clip_media_pipe.py b/examples/contrastive-image-text/clip_media_pipe.py index 62c2a5651b..48811d4c08 100644 --- a/examples/contrastive-image-text/clip_media_pipe.py +++ b/examples/contrastive-image-text/clip_media_pipe.py @@ -16,6 +16,7 @@ import numpy as np from torch.utils.data.sampler import BatchSampler +from optimum.habana.utils import check_habana_frameworks_version from optimum.utils import logging @@ -128,7 +129,10 @@ def __next__(self): read_image_text_from_dataset, dtype.NDT, ) -op_class = fn.operator_add("ClipDataReader") +if check_habana_frameworks_version("1.14.0"): + op_class = fn.operator_add("ClipDataReader") +else: + op_class = fn.operator_add("ClipDataReader", False) op_class.__module__ = fn.__name__ setattr(fn, "ClipDataReader", op_class) diff --git a/examples/contrastive-image-text/run_bridgetower.py 
b/examples/contrastive-image-text/run_bridgetower.py index ee29dd74ad..8ae5caeffd 100644 --- a/examples/contrastive-image-text/run_bridgetower.py +++ b/examples/contrastive-image-text/run_bridgetower.py @@ -57,7 +57,7 @@ def check_optimum_habana_min_version(*a, **b): logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -check_min_version("4.37.0") +check_min_version("4.38.0") check_optimum_habana_min_version("1.10.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/contrastive-image-text/run_clip.py b/examples/contrastive-image-text/run_clip.py index a2b325573c..020efebd6b 100644 --- a/examples/contrastive-image-text/run_clip.py +++ b/examples/contrastive-image-text/run_clip.py @@ -62,7 +62,7 @@ def check_optimum_habana_min_version(*a, **b): logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -check_min_version("4.37.0") +check_min_version("4.38.0") check_optimum_habana_min_version("1.10.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/image-classification/run_image_classification.py b/examples/image-classification/run_image_classification.py index 2eb6ab5caa..8006c2ec5f 100644 --- a/examples/image-classification/run_image_classification.py +++ b/examples/image-classification/run_image_classification.py @@ -64,7 +64,7 @@ def check_optimum_habana_min_version(*a, **b): logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
-check_min_version("4.37.0") +check_min_version("4.38.0") check_optimum_habana_min_version("1.10.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/language-modeling/run_clm.py b/examples/language-modeling/run_clm.py index 4e7b439659..ae77be2e1c 100644 --- a/examples/language-modeling/run_clm.py +++ b/examples/language-modeling/run_clm.py @@ -63,7 +63,7 @@ def check_optimum_habana_min_version(*a, **b): logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -check_min_version("4.37.0") +check_min_version("4.38.0") check_optimum_habana_min_version("1.10.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/language-modeling/run_mlm.py b/examples/language-modeling/run_mlm.py index 8f75320464..b2e722609f 100644 --- a/examples/language-modeling/run_mlm.py +++ b/examples/language-modeling/run_mlm.py @@ -61,7 +61,7 @@ def check_optimum_habana_min_version(*a, **b): logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -check_min_version("4.37.0") +check_min_version("4.38.0") check_optimum_habana_min_version("1.10.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/question-answering/run_qa.py b/examples/question-answering/run_qa.py index 5ac8b2d9de..a72cc68aec 100644 --- a/examples/question-answering/run_qa.py +++ b/examples/question-answering/run_qa.py @@ -60,7 +60,7 @@ def check_optimum_habana_min_version(*a, **b): logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
-check_min_version("4.37.0") +check_min_version("4.38.0") check_optimum_habana_min_version("1.10.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/question-answering/run_seq2seq_qa.py b/examples/question-answering/run_seq2seq_qa.py index 539f26a330..87993a4dc7 100644 --- a/examples/question-answering/run_seq2seq_qa.py +++ b/examples/question-answering/run_seq2seq_qa.py @@ -57,7 +57,7 @@ def check_optimum_habana_min_version(*a, **b): logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -check_min_version("4.37.0") +check_min_version("4.38.0") check_optimum_habana_min_version("1.10.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/speech-recognition/run_speech_recognition_ctc.py b/examples/speech-recognition/run_speech_recognition_ctc.py index 7bfc296ba5..a52ab26886 100644 --- a/examples/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/speech-recognition/run_speech_recognition_ctc.py @@ -60,7 +60,7 @@ def check_optimum_habana_min_version(*a, **b): logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
-check_min_version("4.37.0") +check_min_version("4.38.0") check_optimum_habana_min_version("1.10.0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/speech-recognition/run_speech_recognition_seq2seq.py b/examples/speech-recognition/run_speech_recognition_seq2seq.py index 081b29a205..67af5a4e48 100755 --- a/examples/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/speech-recognition/run_speech_recognition_seq2seq.py @@ -56,7 +56,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.37.0") +check_min_version("4.38.0") check_optimum_habana_min_version("1.10.0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") @@ -469,6 +469,9 @@ def main(): if data_args.language is not None: # We only need to set the task id when the language is specified (i.e. in a multilingual setting) tokenizer.set_prefix_tokens(language=data_args.language, task=data_args.task) + model.generation_config.task = data_args.task + model.generation_config.language = data_args.language + model.generation_config.forced_decoder_ids = None # 6. Resample speech dataset if necessary dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py index 9040a4b1f0..2517f7f4be 100644 --- a/examples/summarization/run_summarization.py +++ b/examples/summarization/run_summarization.py @@ -66,7 +66,7 @@ def check_optimum_habana_min_version(*a, **b): logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
-check_min_version("4.37.0") +check_min_version("4.38.0") check_optimum_habana_min_version("1.10.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/text-classification/run_glue.py b/examples/text-classification/run_glue.py index 0745f075c0..af3c8c7371 100755 --- a/examples/text-classification/run_glue.py +++ b/examples/text-classification/run_glue.py @@ -58,7 +58,7 @@ def check_optimum_habana_min_version(*a, **b): logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -check_min_version("4.37.0") +check_min_version("4.38.0") check_optimum_habana_min_version("1.10.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/translation/run_translation.py b/examples/translation/run_translation.py index 4c0ec070b9..1d35068c72 100644 --- a/examples/translation/run_translation.py +++ b/examples/translation/run_translation.py @@ -63,7 +63,7 @@ def check_optimum_habana_min_version(*a, **b): logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
-check_min_version("4.37.0") +check_min_version("4.38.0") check_optimum_habana_min_version("1.10.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/optimum/habana/transformers/generation/__init__.py b/optimum/habana/transformers/generation/__init__.py index 5b64394cba..15f567b0be 100644 --- a/optimum/habana/transformers/generation/__init__.py +++ b/optimum/habana/transformers/generation/__init__.py @@ -2,6 +2,5 @@ from .stopping_criteria import ( gaudi_MaxLengthCriteria_call, gaudi_MaxNewTokensCriteria_call, - gaudi_StoppingCriteriaList_call, ) from .utils import MODELS_OPTIMIZED_WITH_STATIC_SHAPES, GaudiGenerationMixin diff --git a/optimum/habana/transformers/generation/stopping_criteria.py b/optimum/habana/transformers/generation/stopping_criteria.py index a90a9372e3..4c6eedae61 100644 --- a/optimum/habana/transformers/generation/stopping_criteria.py +++ b/optimum/habana/transformers/generation/stopping_criteria.py @@ -44,7 +44,3 @@ def gaudi_MaxNewTokensCriteria_call(self, input_ids: torch.LongTensor, scores: t return token_idx >= self.max_length else: return input_ids.shape[-1] >= self.max_length - - -def gaudi_StoppingCriteriaList_call(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: - return any(criteria(input_ids, scores, **kwargs) for criteria in self) diff --git a/optimum/habana/transformers/generation/utils.py b/optimum/habana/transformers/generation/utils.py index 42da95552e..3c79763bc8 100755 --- a/optimum/habana/transformers/generation/utils.py +++ b/optimum/habana/transformers/generation/utils.py @@ -31,6 +31,7 @@ validate_stopping_criteria, ) from transformers.generation.utils import ( + NEED_SETUP_CACHE_CLASSES_MAPPING, GenerateBeamDecoderOnlyOutput, GenerateBeamEncoderDecoderOutput, GenerateBeamOutput, @@ -40,6 +41,8 @@ GenerateOutput, GenerationMixin, GenerationMode, + _split_model_inputs, + stack_model_outputs, ) from 
transformers.integrations.deepspeed import is_deepspeed_zero3_enabled from transformers.utils import ModelOutput @@ -158,71 +161,6 @@ def _get_hpu_graphs_kwargs(self, model_kwargs): hpu_graphs_kwargs.update({"bypass_hpu_graphs": True}) return hpu_graphs_kwargs - def _update_model_kwargs_for_generation( - self, - outputs: ModelOutput, - model_kwargs: Dict[str, Any], - is_encoder_decoder: bool = False, - standardize_cache_format: bool = False, - ) -> Dict[str, Any]: - """ - Copied from Transformers: https://github.com/huggingface/transformers/blob/527ab894e59b6582578008e3b47648a65063f73d/src/transformers/generation/utils.py#L745 - - Adds support for `token_idx`, which is necessary for using static shapes. - """ - # mark to identify starting from second token - model_kwargs["first_token"] = False - # update past_key_values - model_kwargs["past_key_values"] = self._extract_past_from_model_output( - outputs, standardize_cache_format=standardize_cache_format - ) - if getattr(outputs, "state", None) is not None: - model_kwargs["state"] = outputs.state - - # update token_type_ids with last value - if "token_type_ids" in model_kwargs: - token_type_ids = model_kwargs["token_type_ids"] - model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1) - - token_idx = model_kwargs.get("token_idx", None) - - if not is_encoder_decoder: - # update attention mask - if "attention_mask" in model_kwargs: - attention_mask = model_kwargs["attention_mask"] - if token_idx is not None: - attention_mask.index_fill_(1, token_idx, 1) - else: - attention_mask = torch.cat( - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 - ) - model_kwargs["attention_mask"] = attention_mask - else: - # update decoder attention mask - if "attention_mask" in model_kwargs: - attention_mask = model_kwargs["attention_mask"] - if token_idx is not None: - attention_mask.index_fill_(1, token_idx, 1) - model_kwargs["attention_mask"] = 
attention_mask - if "decoder_attention_mask" in model_kwargs: - decoder_attention_mask = model_kwargs["decoder_attention_mask"] - if token_idx is not None: - decoder_attention_mask.index_fill_(1, token_idx, 1) - else: - decoder_attention_mask = torch.cat( - [ - decoder_attention_mask, - decoder_attention_mask.new_ones((decoder_attention_mask.shape[0], 1)), - ], - dim=-1, - ) - model_kwargs["decoder_attention_mask"] = decoder_attention_mask - - if token_idx is not None: - token_idx.add_(1) - - return model_kwargs - def _prepare_decoder_attention_mask( self, max_steps: int, # current stopping criteria @@ -239,7 +177,7 @@ def _prepare_decoder_input_ids_for_generation( batch_size: int, model_input_name: str, model_kwargs: Dict[str, torch.Tensor], - decoder_start_token_id: int = None, + decoder_start_token_id: Union[int, List[int]] = None, bos_token_id: int = None, device: torch.device = None, max_new_tokens: int = None, @@ -262,9 +200,17 @@ def _prepare_decoder_input_ids_for_generation( if device is None: device = self.device if token_idx is None: - decoder_input_ids_start = ( - torch.ones((batch_size, 1), dtype=torch.long, device=device) * decoder_start_token_id - ) + if isinstance(decoder_start_token_id, list): + if len(decoder_start_token_id) != batch_size: + raise ValueError( + f"`decoder_start_token_id` expected to have length {batch_size} but got {len(decoder_start_token_id)}" + ) + decoder_input_ids_start = torch.tensor(decoder_start_token_id, dtype=torch.long, device=device) + decoder_input_ids_start = decoder_input_ids_start.view(-1, 1) + else: + decoder_input_ids_start = ( + torch.ones((batch_size, 1), dtype=torch.long, device=device) * decoder_start_token_id + ) else: # creating padded decoder_input_ids to achieve static shapes.
Later new tokens once generated are copied in to decoder_input_ids based on token_idx max_length = max_new_tokens + 1 if max_new_tokens is not None else self.generation_config.max_length @@ -282,7 +228,13 @@ def _prepare_decoder_input_ids_for_generation( pass # user input but doesn't start with decoder_start_token_id -> prepend decoder_start_token_id (and adjust # decoder_attention_mask if provided) - elif (decoder_input_ids[:, 0] != decoder_start_token_id).all().item(): + elif ( + isinstance(decoder_start_token_id, int) + and (decoder_input_ids[:, 0] != decoder_start_token_id).all().item() + ) or ( + isinstance(decoder_start_token_id, torch.Tensor) + and (decoder_input_ids[:, 0] != decoder_start_token_id[:, 0]).all().item() + ): decoder_input_ids = torch.cat([decoder_input_ids_start, decoder_input_ids], dim=-1) if "decoder_attention_mask" in model_kwargs: decoder_attention_mask = model_kwargs["decoder_attention_mask"] @@ -293,6 +245,71 @@ def _prepare_decoder_input_ids_for_generation( model_kwargs["decoder_attention_mask"] = decoder_attention_mask return decoder_input_ids, model_kwargs + def _update_model_kwargs_for_generation( + self, + outputs: ModelOutput, + model_kwargs: Dict[str, Any], + is_encoder_decoder: bool = False, + standardize_cache_format: bool = False, + ) -> Dict[str, Any]: + """ + Copied from Transformers: https://github.com/huggingface/transformers/blob/527ab894e59b6582578008e3b47648a65063f73d/src/transformers/generation/utils.py#L745 + + Adds support for `token_idx`, which is necessary for using static shapes. 
+ """ + # mark to identify starting from second token + model_kwargs["first_token"] = False + # update past_key_values + model_kwargs["past_key_values"] = self._extract_past_from_model_output( + outputs, standardize_cache_format=standardize_cache_format + ) + if getattr(outputs, "state", None) is not None: + model_kwargs["state"] = outputs.state + + # update token_type_ids with last value + if "token_type_ids" in model_kwargs: + token_type_ids = model_kwargs["token_type_ids"] + model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1) + + token_idx = model_kwargs.get("token_idx", None) + + if not is_encoder_decoder: + # update attention mask + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + if token_idx is not None: + attention_mask.index_fill_(1, token_idx, 1) + else: + attention_mask = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + ) + model_kwargs["attention_mask"] = attention_mask + else: + # update decoder attention mask + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + if token_idx is not None: + attention_mask.index_fill_(1, token_idx, 1) + model_kwargs["attention_mask"] = attention_mask + if "decoder_attention_mask" in model_kwargs: + decoder_attention_mask = model_kwargs["decoder_attention_mask"] + if token_idx is not None: + decoder_attention_mask.index_fill_(1, token_idx, 1) + else: + decoder_attention_mask = torch.cat( + [ + decoder_attention_mask, + decoder_attention_mask.new_ones((decoder_attention_mask.shape[0], 1)), + ], + dim=-1, + ) + model_kwargs["decoder_attention_mask"] = decoder_attention_mask + + if token_idx is not None: + token_idx.add_(1) + + return model_kwargs + @torch.no_grad() def update_model_kwargs_for_bucketing( self, params, input_ids, model_kwargs, pad_token_id, bucket_size, reduce_recompile=False @@ -498,9 +515,9 @@ def generate( 
self.generation_config.static_shapes = generation_config.static_shapes if generation_config.ignore_eos is None: generation_config.ignore_eos = kwargs.get("ignore_eos", lazy_mode) - generation_config.validate() model_kwargs = generation_config.update(**kwargs) # All unused kwargs must be model kwargs self._validate_model_kwargs(model_kwargs.copy()) + # 2. Set generation parameters if not already defined logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() @@ -629,6 +646,7 @@ def generate( model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation( inputs_tensor, model_kwargs, model_input_name ) + # 5. Prepare `input_ids` which will be used for auto-regressive generation if self.config.is_encoder_decoder: input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation( @@ -661,6 +679,25 @@ def generate( generation_config.max_length = input_ids_length else: generation_config.max_length = generation_config.max_new_tokens + input_ids_length + # otherwise the total length [inputs-embeds-len + new-tokens-len] will go beyond indicated `max_length` + elif ( + model_input_name == "inputs_embeds" + and inputs_tensor.shape[:-1] != input_ids.shape + and not self.config.is_encoder_decoder + ): + generation_config.max_length -= inputs_tensor.shape[1] + + # if we don't pass `past_key_values` and a cache_implementation is specified + if generation_config.cache_implementation in NEED_SETUP_CACHE_CLASSES_MAPPING and not model_kwargs.get( + "past_key_values", False + ): + cache_cls = NEED_SETUP_CACHE_CLASSES_MAPPING[generation_config.cache_implementation] + if not callable(getattr(self, "_setup_cache", None)): + raise ValueError( + "The `generation_config` defines a `cache_implementation` that is not compatible with this model." + " Make sure it has a `_setup_cache` function." 
+ ) + self._setup_cache(cache_cls, max_batch_size=batch_size, max_cache_len=generation_config.max_length) self._validate_generated_length( generation_config, @@ -788,6 +825,7 @@ def generate( pad_token_id=generation_config.pad_token_id, eos_token_id=generation_config.eos_token_id, output_scores=generation_config.output_scores, + output_logits=generation_config.output_logits, return_dict_in_generate=generation_config.return_dict_in_generate, synced_gpus=synced_gpus, streamer=streamer, @@ -802,6 +840,7 @@ def generate( pad_token_id=generation_config.pad_token_id, eos_token_id=generation_config.eos_token_id, output_scores=generation_config.output_scores, + output_logits=generation_config.output_logits, return_dict_in_generate=generation_config.return_dict_in_generate, synced_gpus=synced_gpus, streamer=streamer, @@ -825,6 +864,7 @@ def generate( pad_token_id=generation_config.pad_token_id, eos_token_id=generation_config.eos_token_id, output_scores=generation_config.output_scores, + output_logits=generation_config.output_logits, return_dict_in_generate=generation_config.return_dict_in_generate, synced_gpus=synced_gpus, streamer=streamer, @@ -855,6 +895,7 @@ def generate( pad_token_id=generation_config.pad_token_id, eos_token_id=generation_config.eos_token_id, output_scores=generation_config.output_scores, + output_logits=generation_config.output_logits, return_dict_in_generate=generation_config.return_dict_in_generate, synced_gpus=synced_gpus, streamer=streamer, @@ -892,8 +933,10 @@ def generate( pad_token_id=generation_config.pad_token_id, eos_token_id=generation_config.eos_token_id, output_scores=generation_config.output_scores, + output_logits=generation_config.output_logits, return_dict_in_generate=generation_config.return_dict_in_generate, synced_gpus=synced_gpus, + sequential=generation_config.low_memory, lazy_mode=lazy_mode, profiling_warmup_steps=profiling_warmup_steps, profiling_steps=profiling_steps, @@ -933,6 +976,7 @@ def generate( 
pad_token_id=generation_config.pad_token_id, eos_token_id=generation_config.eos_token_id, output_scores=generation_config.output_scores, + output_logits=generation_config.output_logits, return_dict_in_generate=generation_config.return_dict_in_generate, synced_gpus=synced_gpus, lazy_mode=lazy_mode, @@ -969,6 +1013,7 @@ def generate( pad_token_id=generation_config.pad_token_id, eos_token_id=generation_config.eos_token_id, output_scores=generation_config.output_scores, + output_logits=generation_config.output_logits, return_dict_in_generate=generation_config.return_dict_in_generate, synced_gpus=synced_gpus, lazy_mode=lazy_mode, @@ -1045,6 +1090,7 @@ def typeerror(): pad_token_id=generation_config.pad_token_id, eos_token_id=generation_config.eos_token_id, output_scores=generation_config.output_scores, + output_logits=generation_config.output_logits, return_dict_in_generate=generation_config.return_dict_in_generate, synced_gpus=synced_gpus, lazy_mode=lazy_mode, @@ -1067,6 +1113,7 @@ def contrastive_search( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, output_scores: Optional[bool] = None, + output_logits: Optional[bool] = None, return_dict_in_generate: Optional[bool] = None, synced_gpus: bool = False, streamer: Optional["BaseStreamer"] = None, @@ -1117,6 +1164,9 @@ def contrastive_search( for more details. output_scores (`bool`, *optional*, defaults to `False`): Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + output_logits (`bool`, *optional*, defaults to `False`): + Whether or not to return the raw prediction logit scores. See `logits` under returned tensors + for more details. return_dict_in_generate (`bool`, *optional*, defaults to `False`): Whether or not to return a [`transformers.generationutils.ModelOutput`] instead of a plain tuple. 
synced_gpus (`bool`, *optional*, defaults to `False`): @@ -1178,6 +1228,7 @@ def greedy_search( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, output_scores: Optional[bool] = None, + output_logits: Optional[bool] = None, return_dict_in_generate: Optional[bool] = None, synced_gpus: bool = False, streamer: Optional["BaseStreamer"] = None, @@ -1224,6 +1275,9 @@ def greedy_search( for more details. output_scores (`bool`, *optional*, defaults to `False`): Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + output_logits (`bool`, *optional*, defaults to `False`): + Whether or not to return the raw prediction logit scores. See `logits` under returned tensors + for more details. return_dict_in_generate (`bool`, *optional*, defaults to `False`): Whether or not to return a [`transformers.generationutils.ModelOutput`] instead of a plain tuple. synced_gpus (`bool`, *optional*, defaults to `False`): @@ -1317,6 +1371,7 @@ def greedy_search( ) # init attention / hidden states / scores tuples + raw_logits = () if (return_dict_in_generate and output_logits) else None scores = () if (return_dict_in_generate and output_scores) else None decoder_attentions = () if (return_dict_in_generate and output_attentions) else None cross_attentions = () if (return_dict_in_generate and output_attentions) else None @@ -1423,6 +1478,8 @@ def greedy_search( if return_dict_in_generate: if output_scores: scores += (next_tokens_scores,) + if output_logits: + raw_logits += (next_token_logits,) if output_attentions: decoder_attentions += ( (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) @@ -1486,6 +1543,7 @@ def greedy_search( return GenerateEncoderDecoderOutput( sequences=input_ids, scores=scores, + logits=raw_logits, encoder_attentions=encoder_attentions, encoder_hidden_states=encoder_hidden_states, decoder_attentions=decoder_attentions, @@ -1497,6 +1555,7 @@ def 
greedy_search( return GenerateDecoderOnlyOutput( sequences=input_ids, scores=scores, + logits=raw_logits, attentions=decoder_attentions, hidden_states=decoder_hidden_states, past_key_values=model_kwargs.get("past_key_values"), @@ -1516,6 +1575,7 @@ def sample( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, output_scores: Optional[bool] = None, + output_logits: Optional[bool] = None, return_dict_in_generate: Optional[bool] = None, synced_gpus: bool = False, streamer: Optional["BaseStreamer"] = None, @@ -1565,6 +1625,9 @@ def sample( for more details. output_scores (`bool`, *optional*, defaults to `False`): Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + output_logits (`bool`, *optional*, defaults to `False`): + Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for + more details. return_dict_in_generate (`bool`, *optional*, defaults to `False`): Whether or not to return a [`transformers.generationutils.ModelOutput`] instead of a plain tuple. 
synced_gpus (`bool`, *optional*, defaults to `False`): @@ -1663,6 +1726,7 @@ def sample( eos_token_id = [eos_token_id] eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None output_scores = output_scores if output_scores is not None else self.generation_config.output_scores + output_logits = output_logits if output_logits is not None else self.generation_config.output_logits output_attentions = ( output_attentions if output_attentions is not None else self.generation_config.output_attentions ) @@ -1677,6 +1741,7 @@ def sample( # init attention / hidden states / scores tuples scores = () if (return_dict_in_generate and output_scores) else None + raw_logits = () if (return_dict_in_generate and output_logits) else None decoder_attentions = () if (return_dict_in_generate and output_attentions) else None cross_attentions = () if (return_dict_in_generate and output_attentions) else None decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None @@ -1746,6 +1811,8 @@ def sample( if return_dict_in_generate: if output_scores: scores += (next_token_scores,) + if output_logits: + raw_logits += (next_token_logits,) if output_attentions: decoder_attentions += ( (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) @@ -1812,6 +1879,7 @@ def sample( return GenerateEncoderDecoderOutput( sequences=input_ids, scores=scores, + logits=raw_logits, encoder_attentions=encoder_attentions, encoder_hidden_states=encoder_hidden_states, decoder_attentions=decoder_attentions, @@ -1823,6 +1891,7 @@ def sample( return GenerateDecoderOnlyOutput( sequences=input_ids, scores=scores, + logits=raw_logits, attentions=decoder_attentions, hidden_states=decoder_hidden_states, past_key_values=model_kwargs.get("past_key_values"), @@ -1842,8 +1911,10 @@ def beam_search( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, output_scores: Optional[bool] = 
None, + output_logits: Optional[bool] = None, return_dict_in_generate: Optional[bool] = None, synced_gpus: bool = False, + sequential: Optional[bool] = None, lazy_mode: Optional[bool] = False, profiling_warmup_steps: Optional[int] = 0, profiling_steps: Optional[int] = 0, @@ -1888,10 +1959,17 @@ def beam_search( for more details. output_scores (`bool`, *optional*, defaults to `False`): Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + output_logits (`bool`, *optional*, defaults to `False`): + Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for + more details. return_dict_in_generate (`bool`, *optional*, defaults to `False`): Whether or not to return a [`transformers.generationutils.ModelOutput`] instead of a plain tuple. synced_gpus (`bool`, *optional*, defaults to `False`): Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + sequential (`bool`, defaults to `False`): + By default, beam search has `batch_size * num_beams` as effective batch size (see `beam_search()` for + more details). This flag will avoid parallelizing the beam search and will instead run beam search + sequentially. lazy_mode (`bool`, *optional*, defaults to `False`): Whether the run is executed in lazy mode or not (i.e. eager mode). 
profiling_warmup_steps (`int`, *optional*, defaults to 0): @@ -1962,6 +2040,7 @@ def beam_search( # init values logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + sequential = sequential if sequential is not None else self.generation_config.low_memory if max_length is not None: warnings.warn( ( @@ -1978,6 +2057,7 @@ def beam_search( if isinstance(eos_token_id, int): eos_token_id = [eos_token_id] output_scores = output_scores if output_scores is not None else self.generation_config.output_scores + output_logits = output_logits if output_logits is not None else self.generation_config.output_logits output_attentions = ( output_attentions if output_attentions is not None else self.generation_config.output_attentions ) @@ -2006,6 +2086,7 @@ def beam_search( # init attention / hidden states / scores tuples scores = () if (return_dict_in_generate and output_scores) else None + raw_logits = () if (return_dict_in_generate and output_logits) else None beam_indices = ( tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None ) @@ -2148,14 +2229,49 @@ def expand_if_needed(tensor, new_size, value, dim=-1): model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - hpu_graphs_kwargs = self._get_hpu_graphs_kwargs(model_kwargs) - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - **hpu_graphs_kwargs, - ) + # if sequential is True, split the input to batches of batch_size and run sequentially + if sequential: + if any( + model_name in self.__class__.__name__.lower() + for model_name in [ + "fsmt", + "reformer", + "bloom", + "ctrl", + "gpt_bigcode", + "transo_xl", + "xlnet", + "cpm", + ] + ): + raise RuntimeError( + f"Currently generation for {self.__class__.__name__} is not supported " + 
f"for `low_memory beam_search`. Please open an issue on GitHub if you need this feature." + ) + + inputs_per_sub_batches = _split_model_inputs( + model_inputs, split_size=batch_size, full_batch_size=batch_beam_size + ) + outputs_per_sub_batch = [ + self( + **inputs_per_sub_batch, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + for inputs_per_sub_batch in inputs_per_sub_batches + ] + + outputs = stack_model_outputs(outputs_per_sub_batch) + else: + hpu_graphs_kwargs = self._get_hpu_graphs_kwargs(model_kwargs) + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + **hpu_graphs_kwargs, + ) if synced_gpus and this_peer_finished: cur_len = cur_len + 1 @@ -2183,13 +2299,14 @@ def expand_if_needed(tensor, new_size, value, dim=-1): if return_dict_in_generate: if output_scores: scores += (next_token_scores_processed,) + if output_logits: + raw_logits += (next_token_logits,) if output_attentions: decoder_attentions += ( (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) ) if self.config.is_encoder_decoder: cross_attentions += (outputs.cross_attentions,) - if output_hidden_states: decoder_hidden_states += ( (outputs.decoder_hidden_states,) @@ -2347,6 +2464,7 @@ def move(obj, device): sequences=sequence_outputs["sequences"], sequences_scores=sequence_outputs["sequence_scores"], scores=scores, + logits=raw_logits, beam_indices=sequence_outputs["beam_indices"], encoder_attentions=encoder_attentions, encoder_hidden_states=encoder_hidden_states, @@ -2360,6 +2478,7 @@ def move(obj, device): sequences=sequence_outputs["sequences"], sequences_scores=sequence_outputs["sequence_scores"], scores=scores, + logits=raw_logits, beam_indices=sequence_outputs["beam_indices"], attentions=decoder_attentions, hidden_states=decoder_hidden_states, @@ -2381,6 +2500,7 @@ def beam_sample( output_attentions: 
Optional[bool] = None, output_hidden_states: Optional[bool] = None, output_scores: Optional[bool] = None, + output_logits: Optional[bool] = None, return_dict_in_generate: Optional[bool] = None, synced_gpus: bool = False, lazy_mode: Optional[bool] = False, @@ -2431,6 +2551,9 @@ def beam_sample( for more details. output_scores (`bool`, *optional*, defaults to `False`): Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + output_logits (`bool`, *optional*, defaults to `False`): + Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for + more details. return_dict_in_generate (`bool`, *optional*, defaults to `False`): Whether or not to return a [`transformers.generationutils.ModelOutput`] instead of a plain tuple. synced_gpus (`bool`, *optional*, defaults to `False`): @@ -2527,6 +2650,7 @@ def group_beam_search( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, output_scores: Optional[bool] = None, + output_logits: Optional[bool] = None, return_dict_in_generate: Optional[bool] = None, synced_gpus: bool = False, lazy_mode: Optional[bool] = False, @@ -2573,6 +2697,9 @@ def group_beam_search( for more details. output_scores (`bool`, *optional*, defaults to `False`): Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + output_logits (`bool`, *optional*, defaults to `False`): + Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for + more details. return_dict_in_generate (`bool`, *optional*, defaults to `False`): Whether or not to return a [`transformers.generationutils.ModelOutput`] instead of a plain tuple. 
synced_gpus (`bool`, *optional*, defaults to `False`): @@ -2665,6 +2792,7 @@ def constrained_beam_search( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, output_scores: Optional[bool] = None, + output_logits: Optional[bool] = None, return_dict_in_generate: Optional[bool] = None, synced_gpus: Optional[bool] = None, lazy_mode: Optional[bool] = False, @@ -2716,6 +2844,9 @@ def constrained_beam_search( for more details. output_scores (`bool`, *optional*, defaults to `False`): Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + output_logits (`bool`, *optional*, defaults to `False`): + Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for + more details. return_dict_in_generate (`bool`, *optional*, defaults to `False`): Whether or not to return a [`transformers.generationutils.ModelOutput`] instead of a plain tuple. synced_gpus (`bool`, *optional*, defaults to `False`): @@ -2810,6 +2941,7 @@ def constrained_beam_search( if isinstance(eos_token_id, int): eos_token_id = [eos_token_id] output_scores = output_scores if output_scores is not None else self.generation_config.output_scores + output_logits = output_logits if output_logits is not None else self.generation_config.output_logits output_attentions = ( output_attentions if output_attentions is not None else self.generation_config.output_attentions ) @@ -2838,6 +2970,7 @@ def constrained_beam_search( # init attention / hidden states / scores tuples scores = () if (return_dict_in_generate and output_scores) else None + raw_logits = () if (return_dict_in_generate and output_logits) else None beam_indices = ( tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None ) @@ -2911,6 +3044,8 @@ def constrained_beam_search( if return_dict_in_generate: if output_scores: scores += (next_token_scores,) + if output_logits: + raw_logits += (next_token_logits,) 
if output_attentions: decoder_attentions += ( (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) @@ -3003,6 +3138,7 @@ def constrained_beam_search( sequences=sequence_outputs["sequences"], sequences_scores=sequence_outputs["sequence_scores"], scores=scores, + logits=raw_logits, beam_indices=sequence_outputs["beam_indices"], encoder_attentions=encoder_attentions, encoder_hidden_states=encoder_hidden_states, @@ -3016,6 +3152,7 @@ def constrained_beam_search( sequences=sequence_outputs["sequences"], sequences_scores=sequence_outputs["sequence_scores"], scores=scores, + logits=raw_logits, beam_indices=sequence_outputs["beam_indices"], attentions=decoder_attentions, hidden_states=decoder_hidden_states, @@ -3038,6 +3175,7 @@ def assisted_decoding( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, output_scores: Optional[bool] = None, + output_logits: Optional[bool] = None, return_dict_in_generate: Optional[bool] = None, synced_gpus: bool = False, lazy_mode: Optional[bool] = False, @@ -3095,6 +3233,9 @@ def assisted_decoding( for more details. output_scores (`bool`, *optional*, defaults to `False`): Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + output_logits (`bool`, *optional*, defaults to `False`): + Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for + more details. return_dict_in_generate (`bool`, *optional*, defaults to `False`): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
synced_gpus (`bool`, *optional*, defaults to `False`): diff --git a/optimum/habana/transformers/integrations/deepspeed.py b/optimum/habana/transformers/integrations/deepspeed.py index eaeb452110..716d7cd141 100644 --- a/optimum/habana/transformers/integrations/deepspeed.py +++ b/optimum/habana/transformers/integrations/deepspeed.py @@ -62,14 +62,25 @@ def trainer_config_process(self, args, auto_find_batch_size=False): "per_device_train_batch_size", not auto_find_batch_size, ) - self.fill_match("gradient_accumulation_steps", args.gradient_accumulation_steps, "gradient_accumulation_steps") self.fill_match( - "train_batch_size", train_batch_size, "train_batch_size (calculated)", not auto_find_batch_size + "gradient_accumulation_steps", + args.gradient_accumulation_steps, + "gradient_accumulation_steps", + ) + self.fill_match( + "train_batch_size", + train_batch_size, + "train_batch_size (calculated)", + not auto_find_batch_size, ) self.fill_match("gradient_clipping", args.max_grad_norm, "max_grad_norm") self.fill_match("optimizer.params.lr", args.learning_rate, "learning_rate") - self.fill_match("optimizer.params.betas", [args.adam_beta1, args.adam_beta2], "adam_beta1+adam_beta2") + self.fill_match( + "optimizer.params.betas", + [args.adam_beta1, args.adam_beta2], + "adam_beta1+adam_beta2", + ) self.fill_match("optimizer.params.eps", args.adam_epsilon, "adam_epsilon") self.fill_match("optimizer.params.weight_decay", args.weight_decay, "weight_decay") diff --git a/optimum/habana/transformers/modeling_utils.py b/optimum/habana/transformers/modeling_utils.py index bf7235ca5f..701128ec1e 100644 --- a/optimum/habana/transformers/modeling_utils.py +++ b/optimum/habana/transformers/modeling_utils.py @@ -20,7 +20,6 @@ GaudiGenerationMixin, gaudi_MaxLengthCriteria_call, gaudi_MaxNewTokensCriteria_call, - gaudi_StoppingCriteriaList_call, ) from .models import ( GaudiBloomForCausalLM, @@ -37,9 +36,12 @@ GaudiGPTNeoXForCausalLM, GaudiLlamaAttention, GaudiLlamaDecoderLayer, + 
GaudiLlamaDynamicNTKScalingRotaryEmbedding, GaudiLlamaForCausalLM, + GaudiLlamaLinearScalingRotaryEmbedding, GaudiLlamaMLP, GaudiLlamaModel, + GaudiLlamaRotaryEmbedding, GaudiMistralForCausalLM, GaudiMixtralForCausalLM, GaudiMptForCausalLM, @@ -72,6 +74,7 @@ gaudi_bloom_convert_to_bloom_cache, gaudi_bloom_convert_to_standard_cache, gaudi_bloom_model_forward, + gaudi_check_and_enable_sdpa, gaudi_codegen_block_forward, gaudi_codegen_model_forward, gaudi_conv1d_forward, @@ -175,7 +178,6 @@ def adapt_transformers_to_gaudi(): transformers.modeling_utils.GenerationConfig = GaudiGenerationConfig transformers.generation.MaxLengthCriteria.__call__ = gaudi_MaxLengthCriteria_call transformers.generation.MaxNewTokensCriteria.__call__ = gaudi_MaxNewTokensCriteria_call - transformers.generation.StoppingCriteriaList.__call__ = gaudi_StoppingCriteriaList_call # Optimization for BLOOM generation on Gaudi transformers.models.bloom.modeling_bloom.BloomAttention.forward = gaudi_bloom_attention_forward @@ -215,6 +217,10 @@ def adapt_transformers_to_gaudi(): # so that Torch Autocast is disabled for specific parts of the code transformers.modeling_utils.ModuleUtilsMixin.invert_attention_mask = gaudi_invert_attention_mask transformers.modeling_utils.ModuleUtilsMixin.get_extended_attention_mask = gaudi_get_extended_attention_mask + + # Override sdpa check on Gaudi + transformers.modeling_utils.PreTrainedModel._check_and_enable_sdpa = gaudi_check_and_enable_sdpa + # AlbertModel.forward does not rely on get_extended_attention_mask so it also needs to be replaced transformers.models.albert.modeling_albert.AlbertModel.forward = gaudi_albert_forward @@ -264,7 +270,11 @@ def adapt_transformers_to_gaudi(): transformers.models.llama.modeling_llama.LlamaAttention = GaudiLlamaAttention transformers.models.llama.modeling_llama.LlamaMLP = GaudiLlamaMLP transformers.models.llama.modeling_llama.LlamaDecoderLayer = GaudiLlamaDecoderLayer - + transformers.models.llama.modeling_llama.LlamaRotaryEmbedding = 
GaudiLlamaRotaryEmbedding + transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding = GaudiLlamaLinearScalingRotaryEmbedding + transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding = ( + GaudiLlamaDynamicNTKScalingRotaryEmbedding + ) transformers.models.llama.modeling_llama.LlamaRMSNorm.forward = gaudi_llama_rmsnorm_forward # Optimization for falcon generation on Gaudi diff --git a/optimum/habana/transformers/models/__init__.py b/optimum/habana/transformers/models/__init__.py index 98fc51eae7..fff777f72a 100644 --- a/optimum/habana/transformers/models/__init__.py +++ b/optimum/habana/transformers/models/__init__.py @@ -71,9 +71,12 @@ from .llama import ( GaudiLlamaAttention, GaudiLlamaDecoderLayer, + GaudiLlamaDynamicNTKScalingRotaryEmbedding, GaudiLlamaForCausalLM, + GaudiLlamaLinearScalingRotaryEmbedding, GaudiLlamaMLP, GaudiLlamaModel, + GaudiLlamaRotaryEmbedding, gaudi_llama_rmsnorm_forward, ) from .mistral import ( @@ -90,7 +93,12 @@ gaudi_mixtral_model_forward, gaudi_mixtral_rmsnorm_forward, ) -from .modeling_all_models import gaudi_conv1d_forward, gaudi_get_extended_attention_mask, gaudi_invert_attention_mask +from .modeling_all_models import ( + gaudi_check_and_enable_sdpa, + gaudi_conv1d_forward, + gaudi_get_extended_attention_mask, + gaudi_invert_attention_mask, +) from .mpt import ( GaudiMptForCausalLM, GaudiMptModel, diff --git a/optimum/habana/transformers/models/blip/modeling_blip_text.py b/optimum/habana/transformers/models/blip/modeling_blip_text.py index 382911ae93..23d4ee3f3c 100644 --- a/optimum/habana/transformers/models/blip/modeling_blip_text.py +++ b/optimum/habana/transformers/models/blip/modeling_blip_text.py @@ -470,7 +470,7 @@ def gaudi_BlipTextLMHead_forward( # we are doing next-token prediction; shift prediction scores and input ids by one shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() labels = labels[:, 1:].contiguous().to(shifted_prediction_scores.device) - loss_fct = 
CrossEntropyLoss(reduction=reduction, label_smoothing=0.1) + loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=self.label_smoothing) lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) if reduction == "none": lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1) @@ -493,7 +493,7 @@ def gaudi_BlipTextLMHead_prepare_inputs_for_generation( self, input_ids, past_key_values=None, attention_mask=None, token_idx=None, **model_kwargs ): """ - Copied from BlipTextLMHeadModel.forward: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip_text.py#L910 + Copied from BlipTextLMHeadModel.prepare_inputs_for_generation: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/blip/modeling_blip_text.py#L910 The only differences are: - add token_idx support, add position_ids """ diff --git a/optimum/habana/transformers/models/llama/__init__.py b/optimum/habana/transformers/models/llama/__init__.py index 7d93b38078..20703ffd09 100644 --- a/optimum/habana/transformers/models/llama/__init__.py +++ b/optimum/habana/transformers/models/llama/__init__.py @@ -1,8 +1,11 @@ from .modeling_llama import ( GaudiLlamaAttention, GaudiLlamaDecoderLayer, + GaudiLlamaDynamicNTKScalingRotaryEmbedding, GaudiLlamaForCausalLM, + GaudiLlamaLinearScalingRotaryEmbedding, GaudiLlamaMLP, GaudiLlamaModel, + GaudiLlamaRotaryEmbedding, gaudi_llama_rmsnorm_forward, ) diff --git a/optimum/habana/transformers/models/llama/modeling_llama.py b/optimum/habana/transformers/models/llama/modeling_llama.py index b6411e6b54..4f3e4625a4 100755 --- a/optimum/habana/transformers/models/llama/modeling_llama.py +++ b/optimum/habana/transformers/models/llama/modeling_llama.py @@ -4,8 +4,7 @@ import torch import torch.nn.functional as F -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa +from 
transformers.cache_utils import Cache, DynamicCache, StaticCache from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast from transformers.models.llama.configuration_llama import LlamaConfig from transformers.models.llama.modeling_llama import ( @@ -85,6 +84,41 @@ def gaudi_llama_rmsnorm_forward(self, hidden_states): return self.weight * hidden_states.to(input_dtype) +class GaudiLlamaMLP(LlamaMLP): + def pre_mlp_forward(self, x): + if self.config.pretraining_tp > 1: + slice = self.intermediate_size // self.config.pretraining_tp + gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) + up_proj_slices = self.up_proj.weight.split(slice, dim=0) + down_proj_slices = self.down_proj.weight.split(slice, dim=1) + + gate_proj = torch.cat( + [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1 + ) + up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1) + + intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) + down_proj = [ + F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp) + ] + output = sum(down_proj) + else: + input = self.act_fn(self.gate_proj(x)) * self.up_proj(x) + output = self.down_proj(input) + return output + + def mlp_all_reduce(self, x): + if hasattr(self.down_proj, "all_reduce"): + self.down_proj.all_reduce(x) + + def post_mlp_forward(self, x): + if self.config.pretraining_tp > 1: + return x + if hasattr(self.down_proj, "post_all_reduce"): + return self.down_proj.post_all_reduce(x) + return x + + def gaudi_llama_repeat_kv( query_states: torch.Tensor, key_states: torch.Tensor, @@ -154,6 +188,76 @@ def forward(self, cur, dim, idx): return update(self.cache, cur, dim, idx, self.inp_seq_len) +class GaudiLlamaRotaryEmbedding(torch.nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + super().__init__() + + 
self.scaling_factor = scaling_factor + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("_cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("_sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self._cos_cached[:seq_len].to(dtype=x.dtype), + self._sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +class GaudiLlamaLinearScalingRotaryEmbedding(GaudiLlamaRotaryEmbedding): + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + t = t / self.scaling_factor + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("_cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("_sin_cached", emb.sin().to(dtype), persistent=False) + + +class 
GaudiLlamaDynamicNTKScalingRotaryEmbedding(GaudiLlamaRotaryEmbedding): + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("_cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("_sin_cached", emb.sin().to(dtype), persistent=False) + + class GaudiLlamaAttention(LlamaAttention): def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None): super().__init__(config, layer_idx) @@ -202,6 +306,7 @@ def pre_attn_forward( past_key_value: Optional[Cache] = None, output_attentions: bool = False, use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, token_idx: Optional[torch.Tensor] = None, attn_softmax_bf16: Optional[bool] = False, reuse_cache: Optional[bool] = False, @@ -220,11 +325,6 @@ def pre_attn_forward( - add new args use_flash_attention - add new arg flash_attention_recompute """ - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() if self.config.pretraining_tp > 1: @@ -248,7 +348,6 @@ def pre_attn_forward( query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) # TODO: update when auto mp params is enabled in DeepSpeed (cf. https://github.com/HabanaAI/DeepSpeed/blob/94309c7b5dfc1a69858f5c9f25737b2f81a332a5/deepspeed/module_inject/replace_module.py#L440) key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) @@ -256,12 +355,6 @@ def pre_attn_forward( kv_seq_len = key_states.shape[-2] if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) if token_idx is None: if hasattr(past_key_value, "get_usable_length"): kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) @@ -322,32 +415,11 @@ def pre_attn_forward( attn_weights = self.matmul_qk(query_states, key_states.transpose(-2, -1)) * self.norm_factor - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len) and attn_weights.size() != ( - bsz, - self.num_key_value_heads, - self.num_key_value_groups, - q_len, - kv_seq_len, - ): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)} or" - f" {(bsz, self.num_key_value_heads, self.num_key_value_groups, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len) and attention_mask.size() != ( - bsz, - 1, - 1, - q_len, - kv_seq_len, - ): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)} or {(bsz, 1, 1, q_len, kv_seq_len)}," - f" but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask + if cache_position is not None: + causal_mask = attention_mask[:, :, cache_position, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask if attn_softmax_bf16: attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, dtype=query_states.dtype) @@ -387,41 +459,6 @@ def post_attn_forward(self, attn_output): return attn_output -class GaudiLlamaMLP(LlamaMLP): - def pre_mlp_forward(self, x): - if self.config.pretraining_tp > 1: - slice = self.intermediate_size // self.config.pretraining_tp - gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) - up_proj_slices = self.up_proj.weight.split(slice, dim=0) - down_proj_slices = self.down_proj.weight.split(slice, dim=1) - - gate_proj = torch.cat( - [F.linear(x, gate_proj_slices[i]) for i in 
range(self.config.pretraining_tp)], dim=-1 - ) - up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1) - - intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) - down_proj = [ - F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp) - ] - output = sum(down_proj) - else: - input = self.act_fn(self.gate_proj(x)) * self.up_proj(x) - output = self.down_proj(input) - return output - - def mlp_all_reduce(self, x): - if hasattr(self.down_proj, "all_reduce"): - self.down_proj.all_reduce(x) - - def post_mlp_forward(self, x): - if self.config.pretraining_tp > 1: - return x - if hasattr(self.down_proj, "post_all_reduce"): - return self.down_proj.post_all_reduce(x) - return x - - class GaudiLlamaDecoderLayer(LlamaDecoderLayer): def __init__(self, config: LlamaConfig, layer_idx: int): super(LlamaDecoderLayer, self).__init__() @@ -450,6 +487,7 @@ def forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, token_idx: Optional[torch.Tensor] = None, attn_softmax_bf16: Optional[bool] = False, reuse_cache: Optional[bool] = False, @@ -480,6 +518,7 @@ def forward( past_key_value, output_attentions, use_cache, + cache_position, token_idx, attn_softmax_bf16, reuse_cache, @@ -488,6 +527,7 @@ def forward( cache_idx=cache_idx, **kwargs, ) + self.self_attn.attention_all_reduce(output_pre_attn) output_post_attn_pre_mlp, residual_mlp = self.post_attn_pre_mlp(output_pre_attn, residual) self.mlp.mlp_all_reduce(output_post_attn_pre_mlp) @@ -510,6 +550,7 @@ def pre_attn( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, token_idx: Optional[torch.Tensor] = None, attn_softmax_bf16: Optional[bool] = False, 
reuse_cache: Optional[bool] = False, @@ -525,6 +566,7 @@ def pre_attn( past_key_value, output_attentions, use_cache, + cache_position, token_idx, attn_softmax_bf16, reuse_cache, @@ -574,6 +616,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, token_idx: Optional[torch.Tensor] = None, attn_softmax_bf16: Optional[bool] = False, reuse_cache: Optional[bool] = False, @@ -595,12 +638,12 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) elif input_ids is not None: batch_size, seq_length = input_ids.shape[:2] elif inputs_embeds is not None: @@ -608,54 +651,55 @@ def forward( else: raise ValueError("You have to specify either input_ids or inputs_embeds") - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - past_key_values_length = 0 - use_legacy_cache = True - use_new_cache = False # Ignoring new Cache path for HPU - if past_key_values is not None: - if use_cache: - if reuse_cache: - past_key_values_length = past_key_values[0][0][2] - else: - if use_new_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - else: - past_key_values_length = past_key_values[0][0].shape[2] - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." ) - position_ids = position_ids.unsqueeze(0) + use_cache = False if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) + ignore_cache_position = True # Ignoring cache position for HPU + use_new_cache = False # Ignoring new Cache path for HPU + past_seen_tokens = 0 + + if past_key_values is not None and use_cache: # kept for BC (cache positions) + if reuse_cache: + past_seen_tokens = past_key_values[0][0][2] + else: + if use_new_cache: + if not isinstance(past_key_values, StaticCache): + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_seen_tokens = past_key_values.get_seq_length() + else: + past_seen_tokens = past_key_values[0][0].shape[2] + + if ignore_cache_position is False: + if cache_position is None: + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + if position_ids is None and cache_position: + position_ids = cache_position.unsqueeze(0) - if self._use_sdpa and not 
output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. - attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + else: + if position_ids is None: + position_ids = torch.arange( + past_seen_tokens, seq_length + past_seen_tokens, dtype=torch.long, device=inputs_embeds.device + ) + position_ids = position_ids.unsqueeze(0) + cache_position = None + + # HPU specific mask generation + if ignore_cache_position: + causal_mask = _gaudi_prepare_4d_causal_attention_mask( attention_mask, - (batch_size, seq_length), + input_ids.shape if input_ids is not None else (batch_size, seq_length), inputs_embeds, - past_key_values_length, + past_seen_tokens, ) else: - # 4d mask is passed through the layers - attention_mask = _gaudi_prepare_4d_causal_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length - ) - + causal_mask = self._update_causal_mask(attention_mask, inputs_embeds) # embed positions hidden_states = inputs_embeds @@ -672,11 +716,12 @@ def forward( layer_outputs = self._gradient_checkpointing_func( decoder_layer.__call__, hidden_states, - attention_mask, + causal_mask, position_ids, - None if past_key_values is None else past_key_values[layer_idx], + past_key_values, output_attentions, use_cache, + cache_position, None, attn_softmax_bf16, False, @@ -686,11 +731,12 @@ def forward( else: layer_outputs = decoder_layer( hidden_states, - attention_mask=attention_mask, + attention_mask=causal_mask, position_ids=position_ids, past_key_value=None if past_key_values is None else past_key_values[layer_idx], output_attentions=output_attentions, use_cache=use_cache, + cache_position=cache_position, token_idx=token_idx, attn_softmax_bf16=attn_softmax_bf16, reuse_cache=reuse_cache, @@ -698,7 +744,6 @@ def forward( flash_attention_recompute=flash_attention_recompute, cache_idx=cache_idx, ) - hidden_states = 
layer_outputs[0] if use_cache: @@ -716,9 +761,7 @@ def forward( next_cache = None if use_cache: next_cache = ( - next_decoder_cache - if not use_new_cache - else (next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache) + next_decoder_cache.to_legacy_cache() if isinstance(next_decoder_cache, Cache) else next_decoder_cache ) if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) @@ -764,6 +807,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, token_idx: Optional[torch.Tensor] = None, trim_logits: Optional[bool] = False, attn_softmax_bf16: Optional[bool] = False, @@ -788,6 +832,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + cache_position=cache_position, token_idx=token_idx, attn_softmax_bf16=attn_softmax_bf16, reuse_cache=reuse_cache, @@ -839,6 +884,8 @@ def forward( def prepare_inputs_for_generation( self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, token_idx=None, **kwargs ): + past_length = 0 + reuse_cache = kwargs.get("reuse_cache") if past_key_values is not None: if token_idx is not None: @@ -886,16 +933,35 @@ def prepare_inputs_for_generation( position_ids = torch.index_select(position_ids, 1, token_idx - 1) else: position_ids = position_ids[:, -input_ids.shape[1] :] - + # TODO: we are using token_idx, disable this for now + # if self.generation_config.cache_implementation == "static": + # generation with static cache + # cache_position = kwargs.get("cache_position", None) + # if cache_position is None: + # past_length = 0 + # else: + # past_length = cache_position[-1] + 1 + # input_ids = input_ids[:, past_length:] + # position_ids = position_ids[:, past_length:] + + # TODO @gante we should only keep a `cache_position` 
in generate, and do +=1. + # same goes for position ids. Could also help with continued generation. + # cache_position = torch.arange(past_length, past_length + position_ids.shape[-1], device=position_ids.device) + # keep cache_position implementation as None for HPU + cache_position = None # if `inputs_embeds` are passed, we only want to use them in the 1st generation step if inputs_embeds is not None and past_key_values is None: model_inputs = {"inputs_embeds": inputs_embeds} else: - model_inputs = {"input_ids": input_ids} + # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise + # recompiles graphs as the stride of the inputs is a guard. Ref: https://github.com/huggingface/transformers/pull/29114 + # TODO: use `next_tokens` directly instead. + model_inputs = {"input_ids": input_ids.contiguous()} model_inputs.update( { - "position_ids": position_ids, + "position_ids": position_ids.contiguous(), + "cache_position": cache_position, "past_key_values": past_key_values, "use_cache": kwargs.get("use_cache"), "attention_mask": attention_mask, @@ -920,4 +986,4 @@ def apply_customized_rope(q, k, cos, sin, position_ids): k, cos.unsqueeze(0).unsqueeze(0).clone(), sin.unsqueeze(0).unsqueeze(0).clone(), position_ids ) else: - return apply_rotary_pos_emb(q, k, cos, sin, position_ids) + return apply_rotary_pos_emb(q, k, cos, sin) diff --git a/optimum/habana/transformers/models/mistral/modeling_mistral.py b/optimum/habana/transformers/models/mistral/modeling_mistral.py index e2a375db31..a5035b6829 100644 --- a/optimum/habana/transformers/models/mistral/modeling_mistral.py +++ b/optimum/habana/transformers/models/mistral/modeling_mistral.py @@ -397,11 +397,11 @@ def forward( shift_logits = logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens - loss_fct = CrossEntropyLoss() shift_logits = shift_logits.view(-1, self.config.vocab_size) shift_labels = shift_labels.view(-1) - # Enable model 
parallelism + # Ensure tensors are on the same device shift_labels = shift_labels.to(shift_logits.device) + loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits, shift_labels) if not return_dict: diff --git a/optimum/habana/transformers/models/mixtral/modeling_mixtral.py b/optimum/habana/transformers/models/mixtral/modeling_mixtral.py index 61537cfbe0..1069477183 100644 --- a/optimum/habana/transformers/models/mixtral/modeling_mixtral.py +++ b/optimum/habana/transformers/models/mixtral/modeling_mixtral.py @@ -629,10 +629,13 @@ def forward( aux_loss = None if output_router_logits: aux_loss = load_balancing_loss_func( - outputs.router_logits if return_dict else outputs[-1], self.num_experts, self.num_experts_per_tok + outputs.router_logits if return_dict else outputs[-1], + self.num_experts, + self.num_experts_per_tok, + attention_mask, ) if labels is not None: - loss += self.router_aux_loss_coef * aux_loss + loss += self.router_aux_loss_coef * aux_loss.to(loss.device) # make sure to reside in the same device if not return_dict: output = (logits,) + outputs[1:] diff --git a/optimum/habana/transformers/models/modeling_all_models.py b/optimum/habana/transformers/models/modeling_all_models.py index 5b78e5938a..c95284cafd 100644 --- a/optimum/habana/transformers/models/modeling_all_models.py +++ b/optimum/habana/transformers/models/modeling_all_models.py @@ -18,7 +18,8 @@ from typing import Tuple import torch -from transformers.modeling_utils import ModuleUtilsMixin +from transformers.modeling_utils import ModuleUtilsMixin, PretrainedConfig +from transformers.utils.import_utils import is_torch_sdpa_available def gaudi_invert_attention_mask(self, encoder_attention_mask: torch.Tensor) -> torch.Tensor: @@ -113,6 +114,41 @@ def gaudi_conv1d_forward(self, x): return x +# Adapted from transformers.modeling_utils.PreTrainedModel._check_and_enable_sdpa +@classmethod +def gaudi_check_and_enable_sdpa(cls, config, hard_check_only: bool = False) -> PretrainedConfig: + # This 
model doesn't support SDPA in Gaudi yet, fallback to original code. + MODELS_ATTN_IMPLEMENTATION_EAGER = ["bart", "gpt_bigcode", "mistral", "mixtral"] + + if config.model_type in MODELS_ATTN_IMPLEMENTATION_EAGER: + config._attn_implementation = "eager" + return config + + # Otherwise, fallback to original implementation + # https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/modeling_utils.py#L1542 + if hard_check_only: + if not cls._supports_sdpa: + raise ValueError( + f"{cls.__name__} does not support an attention implementation through torch.nn.functional.scaled_dot_product_attention yet." + " Please request the support for this architecture: https://github.com/huggingface/transformers/issues/28005. If you believe" + ' this error is a bug, please open an issue in Transformers GitHub repository and load your model with the argument `attn_implementation="eager"` meanwhile. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="eager")`' + ) + if not is_torch_sdpa_available(): + raise ImportError("PyTorch SDPA requirements in Transformers are not met. 
Please install torch>=2.1.1.") + + if not is_torch_sdpa_available() or not cls._supports_sdpa: + return config + + _is_bettertransformer = getattr(cls, "use_bettertransformer", False) + if _is_bettertransformer: + return config + + if not hard_check_only: + config._attn_implementation = "sdpa" + + return config + + # Splitting DeepSpeed LinearAllReduce to three parts to avoid redundant memory consumption class ScopedLinearAllReduce(torch.nn.Module): def __init__(self, mod, *args, **kwargs): diff --git a/optimum/habana/transformers/trainer.py b/optimum/habana/transformers/trainer.py index 3f81a10b25..ee1427227d 100644 --- a/optimum/habana/transformers/trainer.py +++ b/optimum/habana/transformers/trainer.py @@ -15,6 +15,7 @@ import contextlib import copy +import importlib.metadata import inspect import math import os @@ -34,6 +35,7 @@ from accelerate.data_loader import SeedableRandomSampler from accelerate.utils import DistributedDataParallelKwargs, GradientAccumulationPlugin, save_fsdp_model from huggingface_hub import upload_folder +from packaging import version from torch.utils.data import DataLoader, Dataset, RandomSampler from transformers import Trainer from transformers.data.data_collator import DataCollator @@ -42,6 +44,7 @@ from transformers.integrations.deepspeed import deepspeed_load_checkpoint, is_deepspeed_available from transformers.modeling_utils import PreTrainedModel, load_sharded_checkpoint, unwrap_model from transformers.tokenization_utils_base import PreTrainedTokenizerBase +from transformers.trainer import _get_fsdp_ckpt_kwargs from transformers.trainer_callback import TrainerCallback, TrainerState from transformers.trainer_pt_utils import ( DistributedTensorGatherer, @@ -89,6 +92,7 @@ from optimum.utils import logging from ..accelerate import GaudiAccelerator +from ..accelerate.utils import GaudiDistributedType from ..utils import ( HabanaProfile, get_hpu_memory_stats, @@ -131,7 +135,15 @@ def _is_peft_model(model): - return is_peft_available() 
and isinstance(model, PeftModel) + if is_peft_available(): + classes_to_check = (PeftModel,) if is_peft_available() else () + # Here we also check if the model is an instance of `PeftMixedModel` introduced in peft>=0.7.0: https://github.com/huggingface/transformers/pull/28321 + if version.parse(importlib.metadata.version("peft")) >= version.parse("0.7.0"): + from peft import PeftMixedModel + + classes_to_check = (*classes_to_check, PeftMixedModel) + return isinstance(model, classes_to_check) + return False logger = logging.get_logger(__name__) @@ -669,6 +681,8 @@ def hpu_deepspeed_checkpointing(function, *checkpoint_args): use_accelerator_prepare = True if model is self.model else False if delay_optimizer_creation: + if use_accelerator_prepare: + self.model = self.accelerator.prepare(self.model) self.create_optimizer_and_scheduler(num_training_steps=max_steps) # prepare using `accelerator` prepare @@ -693,13 +707,14 @@ def hpu_deepspeed_checkpointing(function, *checkpoint_args): if self.is_deepspeed_enabled: self.deepspeed = self.model_wrapped - # deepspeed ckpt loading - if resume_from_checkpoint is not None and self.is_deepspeed_enabled: - deepspeed_load_checkpoint(self.model_wrapped, resume_from_checkpoint) - - # fsdp ckpt loading - if resume_from_checkpoint is not None and self.is_fsdp_enabled: - self._load_from_checkpoint(resume_from_checkpoint, self.model_wrapped) + # ckpt loading + if resume_from_checkpoint is not None: + if self.is_deepspeed_enabled: + deepspeed_load_checkpoint( + self.model_wrapped, resume_from_checkpoint, load_module_strict=not _is_peft_model(self.model) + ) + elif self.is_fsdp_enabled: + self._load_from_checkpoint(resume_from_checkpoint, self.model_wrapped) # Check if saved optimizer or scheduler states exist self._load_optimizer_and_scheduler(resume_from_checkpoint) @@ -801,6 +816,7 @@ def hpu_deepspeed_checkpointing(function, *checkpoint_args): self._globalstep_last_logged = self.state.global_step self._zero_model_grad(model) + 
_grad_norm: Optional[float] = None self.control = self.callback_handler.on_train_begin(args, self.state, self.control) @@ -956,10 +972,10 @@ def hpu_deepspeed_checkpointing(function, *checkpoint_args): if self.gaudi_config.use_fused_clip_norm and args.use_habana: # TODO: to merge self.accelerator.clip_grad_norm_ when HMP is removed - self.FusedNorm.clip_norm(model.parameters()) + _grad_norm = self.FusedNorm.clip_norm(model.parameters()) else: # Revert to normal clipping otherwise - self.accelerator.clip_grad_norm_( + _grad_norm = self.accelerator.clip_grad_norm_( model.parameters(), args.max_grad_norm, ) @@ -981,7 +997,7 @@ def hpu_deepspeed_checkpointing(function, *checkpoint_args): self.htcore.mark_step() self.control = self.callback_handler.on_step_end(args, self.state, self.control) - self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) + self._maybe_log_save_evaluate(tr_loss, _grad_norm, model, trial, epoch, ignore_keys_for_eval) else: self.control = self.callback_handler.on_substep_end(args, self.state, self.control) @@ -997,7 +1013,7 @@ def hpu_deepspeed_checkpointing(function, *checkpoint_args): self.control.should_training_stop = True self.control = self.callback_handler.on_epoch_end(args, self.state, self.control) - self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) + self._maybe_log_save_evaluate(tr_loss, _grad_norm, model, trial, epoch, ignore_keys_for_eval) if self.control.should_training_stop: break @@ -1074,10 +1090,18 @@ def _load_best_model(self): model = self.model # TODO: check if the code below works # if self.is_deepspeed_enabled: - # deepspeed_load_checkpoint(self.model_wrapped, self.state.best_model_checkpoint) + # deepspeed_load_checkpoint( + # self.model_wrapped, + # self.state.best_model_checkpoint, + # load_module_strict=not _is_peft_model(self.model), + # ) # elif self.is_fsdp_enabled: # load_result = load_fsdp_model( - # self.accelerator.state.fsdp_plugin, self.accelerator, 
model, self.state.best_model_checkpoint + # self.accelerator.state.fsdp_plugin, + # self.accelerator, + # model, + # self.state.best_model_checkpoint, + # **_get_fsdp_ckpt_kwargs(), # ) if ( os.path.exists(best_model_path) @@ -1130,7 +1154,7 @@ def _load_best_model(self): "on multiple nodes, you should activate `--save_on_each_node`." ) - def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval): + def _maybe_log_save_evaluate(self, tr_loss, _grad_norm, model, trial, epoch, ignore_keys_for_eval): if self.args.adjust_throughput: save_start = time.perf_counter() @@ -1143,6 +1167,16 @@ def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for # reset tr_loss to zero tr_loss -= tr_loss logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4) + + # This grad_norm block was outside of _maybe_log_save_evaluate method causing perf degradataion. + # Moving it here so the grad tensor is only copied when it's needed. + if is_accelerate_available() and self.accelerator.distributed_type == GaudiDistributedType.DEEPSPEED: + grad_norm = model.get_global_grad_norm() + else: + grad_norm = _grad_norm.item() if _grad_norm is not None else None + + if grad_norm is not None: + logs["grad_norm"] = grad_norm logs["learning_rate"] = self._get_learning_rate() self._total_loss_scalar += tr_loss_scalar @@ -1224,7 +1258,7 @@ def _save_checkpoint(self, model, trial, metrics=None): output_dir = os.path.join(run_dir, checkpoint_folder) if os.path.exists(output_dir) and len(os.listdir(output_dir)) > 0: logger.warning( - f"Checkpoint destination directory {output_dir} already exists and is non-empty." + f"Checkpoint destination directory {output_dir} already exists and is non-empty. " "Saving will proceed but saved results may be invalid." 
) staging_output_dir = output_dir @@ -1272,13 +1306,21 @@ def _save_checkpoint(self, model, trial, metrics=None): os.rename(staging_output_dir, output_dir) # Ensure rename completed in cases where os.rename is not atomic - fd = os.open(output_dir, os.O_RDONLY) - os.fsync(fd) - os.close(fd) + # And can only happen on non-windows based systems + if os.name != "nt": + fd = os.open(output_dir, os.O_RDONLY) + os.fsync(fd) + os.close(fd) # Maybe delete some older checkpoints. if self.args.should_save: - self._rotate_checkpoints(use_mtime=True, output_dir=run_dir) + # Solely rely on numerical checkpoint id for rotation. + # mtime is not reliable especially on some fuse fs in cloud environments. + self._rotate_checkpoints(use_mtime=False, output_dir=run_dir) + elif self.is_local_process_zero(): + # Clean up the remaining staging checkpoint folders on other nodes + if staging_output_dir != output_dir and os.path.exists(staging_output_dir): + shutil.rmtree(staging_output_dir) self.args.distributed_state.wait_for_everyone() @@ -1319,7 +1361,9 @@ def _save_optimizer_and_scheduler(self, output_dir): self.model_wrapped.save_checkpoint(output_dir) elif self.is_fsdp_enabled: # save fsdp specific ckpt for resuming from ckpt - save_fsdp_model(self.accelerator.state.fsdp_plugin, self.accelerator, self.model, output_dir) + save_fsdp_model( + self.accelerator.state.fsdp_plugin, self.accelerator, self.model, output_dir, **_get_fsdp_ckpt_kwargs() + ) save_fsdp_optimizer( self.accelerator.state.fsdp_plugin, self.accelerator, self.optimizer, self.model, output_dir ) @@ -1383,6 +1427,7 @@ def _load_optimizer_and_scheduler(self, checkpoint): self.optimizer, self.model, checkpoint, + **_get_fsdp_ckpt_kwargs(), ) else: self.optimizer.load_state_dict( @@ -2143,12 +2188,10 @@ def create_accelerator_and_postprocess(self): # create accelerator object self.accelerator = GaudiAccelerator( - dispatch_batches=self.args.dispatch_batches, - split_batches=self.args.split_batches, 
deepspeed_plugin=self.args.deepspeed_plugin, gradient_accumulation_plugin=gradient_accumulation_plugin, - even_batches=not self.args.dataloader_drop_last, distribution_strategy=self.args.distribution_strategy, + **self.args.accelerator_config.to_dict(), ) # some Trainer classes need to use `gather` instead of `gather_for_metrics`, thus we store a flag self.gather_function = self.accelerator.gather_for_metrics @@ -2179,6 +2222,20 @@ def create_accelerator_and_postprocess(self): if self.is_deepspeed_enabled and getattr(self.args, "hf_deepspeed_config", None) is None: self.propagate_args_to_deepspeed() + # `save_only_model` can't be used with DeepSpeed/FSDP along with `load_best_model_at_end` + if ( + self.args.save_only_model + and (self.is_deepspeed_enabled or self.is_fsdp_enabled) + and self.args.load_best_model_at_end + ): + wrapper = "DeepSpeed" if self.is_deepspeed_enabled else "FSDP" + raise ValueError(f"{wrapper} can't be used with `save_only_model` along with `load_best_model_at_end`.") + + # `auto_find_batch_size` isn't yet supported with DeepSpeed/FSDP + if (self.is_deepspeed_enabled or self.is_fsdp_enabled) and self.args.auto_find_batch_size: + wrapper = "DeepSpeed" if self.is_deepspeed_enabled else "FSDP" + raise NotImplementedError(f"`{wrapper}` doesn't support `auto_find_batch_size`.") + def propagate_args_to_deepspeed(self, auto_find_batch_size=False): """ Sets values in the deepspeed plugin based on the Trainer args diff --git a/optimum/habana/transformers/training_args.py b/optimum/habana/transformers/training_args.py index 5979e00243..d6fca1d67d 100644 --- a/optimum/habana/transformers/training_args.py +++ b/optimum/habana/transformers/training_args.py @@ -25,6 +25,7 @@ from packaging import version from transformers.debug_utils import DebugOption from transformers.file_utils import cached_property, is_torch_available, requires_backends +from transformers.trainer_pt_utils import AcceleratorConfig from transformers.trainer_utils import 
EvaluationStrategy, FSDPOption, HubStrategy, IntervalStrategy, SchedulerType from transformers.training_args import ( OptimizerNames, @@ -563,6 +564,7 @@ def __post_init__(self): ): raise ValueError("`min_num_params` and `transformer_layer_cls_to_wrap` are mutually exclusive.") self.fsdp_config["xla"] = self.fsdp_config.get("xla", False) + self.fsdp_config["xla_fsdp_v2"] = self.fsdp_config.get("xla_fsdp_v2", False) self.fsdp_config["xla_fsdp_grad_ckpt"] = self.fsdp_config.get("xla_fsdp_grad_ckpt", False) # accelerate integration for FSDP @@ -600,6 +602,33 @@ def __post_init__(self): self.fsdp_config.get("activation_checkpointing", "false") ) + if is_accelerate_available(): + if not isinstance(self.accelerator_config, (AcceleratorConfig)): + if self.accelerator_config is None: + self.accelerator_config = AcceleratorConfig() + elif isinstance(self.accelerator_config, dict): + self.accelerator_config = AcceleratorConfig(**self.accelerator_config) + else: + self.accelerator_config = AcceleratorConfig.from_json_file(self.accelerator_config) + if self.dispatch_batches is not None: + warnings.warn( + "Using `--dispatch_batches` is deprecated and will be removed in version 4.41 of 🤗 Transformers. Use" + " `--accelerator_config {'dispatch_batches':VALUE} instead", + FutureWarning, + ) + self.accelerator_config.dispatch_batches = self.dispatch_batches + + if self.split_batches is not None: + warnings.warn( + "Using `--split_batches` is deprecated and will be removed in version 4.41 of 🤗 Transformers. 
Use" + " `--accelerator_config {'split_batches':VALUE} instead", + FutureWarning, + ) + self.accelerator_config.split_batches = self.split_batches + + if self.dataloader_drop_last: + self.accelerator_config.even_batches = False + if isinstance(self.debug, str): self.debug = [DebugOption(s) for s in self.debug.split()] elif self.debug is None: @@ -641,6 +670,12 @@ def __post_init__(self): if self.use_cpu: self.dataloader_pin_memory = False + if self.dataloader_num_workers == 0 and self.dataloader_prefetch_factor is not None: + raise ValueError( + "--dataloader_prefetch_factor can only be set when data is loaded in a different process, i.e." + " when --dataloader_num_workers > 1." + ) + if self.push_to_hub_token is not None: warnings.warn( ( diff --git a/optimum/habana/transformers/training_args_seq2seq.py b/optimum/habana/transformers/training_args_seq2seq.py index 8f960b8910..82e02bb491 100644 --- a/optimum/habana/transformers/training_args_seq2seq.py +++ b/optimum/habana/transformers/training_args_seq2seq.py @@ -50,8 +50,7 @@ class GaudiSeq2SeqTrainingArguments(GaudiTrainingArguments): Allows to load a [`transformers.generation.GenerationConfig`] from the `from_pretrained` method. This can be either: - a string, the *model id* of a pretrained model configuration hosted inside a model repo on - huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced - under a user or organization name, like `dbmdz/bert-base-german-cased`. + huggingface.co. - a path to a *directory* containing a configuration file saved using the [`transformers.GenerationConfig.save_pretrained`] method, e.g., `./my_model_directory/`. - a [`transformers.generation.GenerationConfig`] object. 
diff --git a/setup.py b/setup.py index ac835c97a2..5293e9b30f 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ INSTALL_REQUIRES = [ - "transformers >= 4.37.0, < 4.38.0", + "transformers >= 4.38.0, < 4.39.0", "optimum", "torch", "accelerate < 0.28.0", diff --git a/tests/example_diff/run_audio_classification.txt b/tests/example_diff/run_audio_classification.txt index 5201c92c59..8cd2f11e5b 100644 --- a/tests/example_diff/run_audio_classification.txt +++ b/tests/example_diff/run_audio_classification.txt @@ -31,7 +31,7 @@ < check_min_version("4.39.0.dev0") --- > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -> check_min_version("4.37.0") +> check_min_version("4.38.0") > check_optimum_habana_min_version("1.10.0") 180,182d182 < freeze_feature_extractor: Optional[bool] = field( diff --git a/tests/example_diff/run_clip.txt b/tests/example_diff/run_clip.txt index 38eb55eb2a..22100f2315 100644 --- a/tests/example_diff/run_clip.txt +++ b/tests/example_diff/run_clip.txt @@ -28,7 +28,7 @@ < check_min_version("4.39.0.dev0") --- > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -> check_min_version("4.37.0") +> check_min_version("4.38.0") > check_optimum_habana_min_version("1.10.0") 188a197,199 > mediapipe_dataloader: bool = field( diff --git a/tests/example_diff/run_clm.txt b/tests/example_diff/run_clm.txt index 4f957539c6..96b90da5c2 100644 --- a/tests/example_diff/run_clm.txt +++ b/tests/example_diff/run_clm.txt @@ -18,7 +18,7 @@ < Trainer, < TrainingArguments, 49,50d44 -< is_torch_tpu_available, +< is_torch_xla_available, < set_seed, 56a51,52 > from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments @@ -38,7 +38,7 @@ > 64a65,70 > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
-> check_min_version("4.37.0") +> check_min_version("4.38.0") > check_optimum_habana_min_version("1.10.0") > > require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") @@ -103,9 +103,9 @@ 598a630 > gaudi_config=gaudi_config, 605,608c637,638 -< compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None, +< compute_metrics=compute_metrics if training_args.do_eval and not is_torch_xla_available() else None, < preprocess_logits_for_metrics=preprocess_logits_for_metrics -< if training_args.do_eval and not is_torch_tpu_available() +< if training_args.do_eval and not is_torch_xla_available() < else None, --- > compute_metrics=compute_metrics if training_args.do_eval else None, diff --git a/tests/example_diff/run_glue.txt b/tests/example_diff/run_glue.txt index 78677e0e8e..ff1630582f 100644 --- a/tests/example_diff/run_glue.txt +++ b/tests/example_diff/run_glue.txt @@ -31,7 +31,7 @@ < check_min_version("4.39.0.dev0") --- > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -> check_min_version("4.37.0") +> check_min_version("4.38.0") > check_optimum_habana_min_version("1.10.0") 68,69d77 < logger = logging.getLogger(__name__) diff --git a/tests/example_diff/run_image_classification.txt b/tests/example_diff/run_image_classification.txt index c0812988de..220871b28e 100644 --- a/tests/example_diff/run_image_classification.txt +++ b/tests/example_diff/run_image_classification.txt @@ -28,7 +28,7 @@ < check_min_version("4.39.0.dev0") --- > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
-> check_min_version("4.37.0") +> check_min_version("4.38.0") > check_optimum_habana_min_version("1.10.0") 191c199 < parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) diff --git a/tests/example_diff/run_mlm.txt b/tests/example_diff/run_mlm.txt index fb9f239bfa..65df71a802 100644 --- a/tests/example_diff/run_mlm.txt +++ b/tests/example_diff/run_mlm.txt @@ -13,7 +13,7 @@ 46,49d43 < Trainer, < TrainingArguments, -< is_torch_tpu_available, +< is_torch_xla_available, < set_seed, 54a49,50 > from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments @@ -34,7 +34,7 @@ 61a62,69 > > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -> check_min_version("4.37.0") +> check_min_version("4.38.0") > check_optimum_habana_min_version("1.10.0") > > require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") @@ -78,9 +78,9 @@ 617a634 > gaudi_config=gaudi_config, 623,626c640,641 -< compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None, +< compute_metrics=compute_metrics if training_args.do_eval and not is_torch_xla_available() else None, < preprocess_logits_for_metrics=preprocess_logits_for_metrics -< if training_args.do_eval and not is_torch_tpu_available() +< if training_args.do_eval and not is_torch_xla_available() < else None, --- > compute_metrics=compute_metrics if training_args.do_eval else None, diff --git a/tests/example_diff/run_qa.txt b/tests/example_diff/run_qa.txt index b2ad22cac6..5f6d798d14 100644 --- a/tests/example_diff/run_qa.txt +++ b/tests/example_diff/run_qa.txt @@ -32,7 +32,7 @@ > 58a62,67 > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
-> check_min_version("4.37.0") +> check_min_version("4.38.0") > check_optimum_habana_min_version("1.10.0") > > require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/tests/example_diff/run_seq2seq_qa.txt b/tests/example_diff/run_seq2seq_qa.txt index 3a434b2afb..7a2696db12 100644 --- a/tests/example_diff/run_seq2seq_qa.txt +++ b/tests/example_diff/run_seq2seq_qa.txt @@ -24,7 +24,7 @@ > 55a59,64 > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -> check_min_version("4.37.0") +> check_min_version("4.38.0") > check_optimum_habana_min_version("1.10.0") > > require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/tests/example_diff/run_speech_recognition_ctc.txt b/tests/example_diff/run_speech_recognition_ctc.txt index 2ece9be9c5..332e555d4a 100644 --- a/tests/example_diff/run_speech_recognition_ctc.txt +++ b/tests/example_diff/run_speech_recognition_ctc.txt @@ -29,7 +29,7 @@ > return () 60a62,67 > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
-> check_min_version("4.37.0") +> check_min_version("4.38.0") > check_optimum_habana_min_version("1.10.0") > > require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/tests/example_diff/run_speech_recognition_seq2seq.txt b/tests/example_diff/run_speech_recognition_seq2seq.txt index fa6647d9d0..3d75555530 100644 --- a/tests/example_diff/run_speech_recognition_seq2seq.txt +++ b/tests/example_diff/run_speech_recognition_seq2seq.txt @@ -22,7 +22,7 @@ 52c59,60 < check_min_version("4.39.0.dev0") --- -> check_min_version("4.37.0") +> check_min_version("4.38.0") > check_optimum_habana_min_version("1.10.0") 237a246,249 > label_features_max_length: int = field( @@ -60,16 +60,20 @@ > f"Process rank: {training_args.local_rank}, device: {training_args.device}, " > + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, " > + f"mixed-precision training: {mixed_precision}" -449a476,479 +445a472,474 +> model.generation_config.task = data_args.task +> model.generation_config.language = data_args.language +> model.generation_config.forced_decoder_ids = None +449a479,482 > logger.warning( > f"The dataset sampling rate ({dataset_sampling_rate}) is different from the feature extractor one" > f" ({feature_extractor.sampling_rate}).Data resampling should be done." > ) -554a585 +554a588 > label_features_max_length=data_args.label_features_max_length, -558c589 +558c592 < trainer = Seq2SeqTrainer( --- > trainer = GaudiSeq2SeqTrainer( -559a591 +559a594 > gaudi_config=gaudi_config, diff --git a/tests/example_diff/run_summarization.txt b/tests/example_diff/run_summarization.txt index c3b9dcef30..ac91cab6b1 100644 --- a/tests/example_diff/run_summarization.txt +++ b/tests/example_diff/run_summarization.txt @@ -36,7 +36,7 @@ > 61a68,73 > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
-> check_min_version("4.37.0") +> check_min_version("4.38.0") > check_optimum_habana_min_version("1.10.0") > > require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/tests/example_diff/run_translation.txt b/tests/example_diff/run_translation.txt index ca6ca78e9d..958dfc5b30 100644 --- a/tests/example_diff/run_translation.txt +++ b/tests/example_diff/run_translation.txt @@ -28,7 +28,7 @@ > 61a65,70 > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -> check_min_version("4.37.0") +> check_min_version("4.38.0") > check_optimum_habana_min_version("1.10.0") > > require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 76bbf78b67..1963f805c7 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -27,7 +27,7 @@ from typing import Dict, List, Optional, Union import numpy as np -from huggingface_hub import HfFolder, delete_repo, list_repo_commits, list_repo_files +from huggingface_hub import HfFolder, ModelCard, delete_repo, list_repo_commits, list_repo_files from parameterized import parameterized from pytest import mark from requests.exceptions import HTTPError @@ -44,6 +44,7 @@ TOKEN, USER, CaptureLogger, + LoggingLevel, TestCasePlus, get_gpu_count, get_tests_dir, @@ -56,6 +57,7 @@ require_torch, ) from transformers.tokenization_utils_base import PreTrainedTokenizerBase +from transformers.trainer_pt_utils import AcceleratorConfig from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, HPSearchBackend, get_last_checkpoint from transformers.training_args import OptimizerNames from transformers.utils import ( @@ -158,8 +160,8 @@ def __init__(self, length=64, seed=42, batch_size=8): np.random.seed(seed) sizes = np.random.randint(1, 20, (length // batch_size,)) # For easy batching, we make every batch_size 
consecutive samples the same size. - self.xs = [np.random.normal(size=(s,)) for s in sizes.repeat(batch_size)] - self.ys = [np.random.normal(size=(s,)) for s in sizes.repeat(batch_size)] + self.xs = [np.random.normal(size=(s,)).astype(np.float32) for s in sizes.repeat(batch_size)] + self.ys = [np.random.normal(size=(s,)).astype(np.float32) for s in sizes.repeat(batch_size)] def __len__(self): return self.length @@ -547,7 +549,7 @@ def test_trainer_with_datasets(self): np.random.seed(42) x = np.random.normal(size=(64,)).astype(np.float32) - y = 2.0 * x + 3.0 + np.random.normal(scale=0.1, size=(64,)) + y = 2.0 * x + 3.0 + np.random.normal(scale=0.1, size=(64,)).astype(np.float32) train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y}) gaudi_config = get_gaudi_config() @@ -1214,17 +1216,19 @@ def test_log_level(self): else: self.assertNotIn(log_info_string, cl.out) - # test with low log_level - lower than info - with CaptureLogger(logger) as cl: - trainer = get_regression_trainer(log_level="debug") - trainer.train() - self.assertIn(log_info_string, cl.out) + with LoggingLevel(logging.INFO): + # test with low log_level - lower than info + with CaptureLogger(logger) as cl: + trainer = get_regression_trainer(log_level="debug") + trainer.train() + self.assertIn(log_info_string, cl.out) - # test with high log_level - should be quiet - with CaptureLogger(logger) as cl: - trainer = get_regression_trainer(log_level="error") - trainer.train() - self.assertNotIn(log_info_string, cl.out) + with LoggingLevel(logging.INFO): + # test with high log_level - should be quiet + with CaptureLogger(logger) as cl: + trainer = get_regression_trainer(log_level="error") + trainer.train() + self.assertNotIn(log_info_string, cl.out) def test_save_checkpoints(self): with tempfile.TemporaryDirectory() as tmpdir: @@ -1896,6 +1900,172 @@ def test_no_wd_param_group(self): self.assertListEqual(trainer.optimizer.param_groups[0]["params"], wd_params) 
self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params) + def test_accelerator_config_empty(self): + # Checks that a config can be made with the defaults if not passed + with tempfile.TemporaryDirectory() as tmp_dir: + config = RegressionModelConfig(a=1.5, b=2.5) + model = RegressionPreTrainedModel(config) + eval_dataset = SampleIterableDataset() + + # Leaves one option as something *not* basic + gaudi_config = get_gaudi_config() + args = RegressionGaudiTrainingArguments(output_dir=tmp_dir, use_habana=True) + trainer = GaudiTrainer(model=model, gaudi_config=gaudi_config, args=args, eval_dataset=eval_dataset) + self.assertEqual(trainer.accelerator.split_batches, False) + self.assertEqual(trainer.accelerator.dispatch_batches, None) + self.assertEqual(trainer.accelerator.even_batches, True) + self.assertEqual(trainer.accelerator.use_seedable_sampler, True) + + def test_accelerator_config_from_dict(self): + # Checks that accelerator kwargs can be passed through + # and the accelerator is initialized respectively + with tempfile.TemporaryDirectory() as tmp_dir: + config = RegressionModelConfig(a=1.5, b=2.5) + model = RegressionPreTrainedModel(config) + eval_dataset = SampleIterableDataset() + + # Leaves all options as something *not* basic + gaudi_config = get_gaudi_config() + args = RegressionGaudiTrainingArguments( + output_dir=tmp_dir, + accelerator_config={ + "split_batches": True, + "dispatch_batches": True, + "even_batches": False, + "use_seedable_sampler": True, + }, + use_habana=True, + ) + trainer = GaudiTrainer(model=model, gaudi_config=gaudi_config, args=args, eval_dataset=eval_dataset) + self.assertEqual(trainer.accelerator.split_batches, True) + self.assertEqual(trainer.accelerator.dispatch_batches, True) + self.assertEqual(trainer.accelerator.even_batches, False) + self.assertEqual(trainer.accelerator.use_seedable_sampler, True) + + def test_accelerator_config_from_yaml(self): + # Checks that accelerator kwargs can be passed through 
+ # and the accelerator is initialized respectively + with tempfile.TemporaryDirectory() as tmp_dir: + path_file = Path(tmp_dir) / "accelerator_config.json" + with open(path_file, "w") as f: + accelerator_config = { + "split_batches": True, + "dispatch_batches": True, + "even_batches": False, + "use_seedable_sampler": False, + } + json.dump(accelerator_config, f) + config = RegressionModelConfig(a=1.5, b=2.5) + model = RegressionPreTrainedModel(config) + eval_dataset = SampleIterableDataset() + + # Leaves all options as something *not* basic + gaudi_config = get_gaudi_config() + args = RegressionGaudiTrainingArguments(output_dir=tmp_dir, accelerator_config=path_file, use_habana=True) + trainer = GaudiTrainer(model=model, gaudi_config=gaudi_config, args=args, eval_dataset=eval_dataset) + self.assertEqual(trainer.accelerator.split_batches, True) + self.assertEqual(trainer.accelerator.dispatch_batches, True) + self.assertEqual(trainer.accelerator.even_batches, False) + self.assertEqual(trainer.accelerator.use_seedable_sampler, False) + + def test_accelerator_config_from_dataclass(self): + # Checks that accelerator kwargs can be passed through + # and the accelerator is initialized respectively + accelerator_config = AcceleratorConfig( + split_batches=True, dispatch_batches=True, even_batches=False, use_seedable_sampler=False + ) + config = RegressionModelConfig(a=1.5, b=2.5) + model = RegressionPreTrainedModel(config) + eval_dataset = SampleIterableDataset() + with tempfile.TemporaryDirectory() as tmp_dir: + gaudi_config = get_gaudi_config() + args = RegressionGaudiTrainingArguments( + output_dir=tmp_dir, accelerator_config=accelerator_config, use_habana=True + ) + trainer = GaudiTrainer(model=model, gaudi_config=gaudi_config, args=args, eval_dataset=eval_dataset) + self.assertEqual(trainer.accelerator.split_batches, True) + self.assertEqual(trainer.accelerator.dispatch_batches, True) + self.assertEqual(trainer.accelerator.even_batches, False) + 
self.assertEqual(trainer.accelerator.use_seedable_sampler, False) + + def test_accelerator_config_from_partial(self): + # Checks that accelerator kwargs can be passed through + # and the accelerator is initialized respectively + with tempfile.TemporaryDirectory() as tmp_dir: + config = RegressionModelConfig(a=1.5, b=2.5) + model = RegressionPreTrainedModel(config) + eval_dataset = SampleIterableDataset() + + # Leaves one option as something *not* basic + gaudi_config = get_gaudi_config() + args = RegressionGaudiTrainingArguments( + output_dir=tmp_dir, + accelerator_config={ + "split_batches": True, + }, + use_habana=True, + ) + trainer = GaudiTrainer(model=model, gaudi_config=gaudi_config, args=args, eval_dataset=eval_dataset) + self.assertEqual(trainer.accelerator.split_batches, True) + self.assertEqual(trainer.accelerator.dispatch_batches, None) + self.assertEqual(trainer.accelerator.even_batches, True) + self.assertEqual(trainer.accelerator.use_seedable_sampler, True) + + def test_accelerator_config_from_dict_with_deprecated_args(self): + # Checks that accelerator kwargs can be passed through + # and the accelerator is initialized respectively + # and maintains the deprecated args if passed in + with tempfile.TemporaryDirectory() as tmp_dir: + config = RegressionModelConfig(a=1.5, b=2.5) + model = RegressionPreTrainedModel(config) + eval_dataset = SampleIterableDataset() + + # Leaves all options as something *not* basic + with self.assertWarns(FutureWarning) as cm: + gaudi_config = get_gaudi_config() + args = RegressionGaudiTrainingArguments( + output_dir=tmp_dir, + accelerator_config={ + "split_batches": True, + }, + dispatch_batches=False, + use_habana=True, + ) + self.assertIn("dispatch_batches", str(cm.warnings[0].message)) + trainer = GaudiTrainer(model=model, gaudi_config=gaudi_config, args=args, eval_dataset=eval_dataset) + self.assertEqual(trainer.accelerator.dispatch_batches, False) + self.assertEqual(trainer.accelerator.split_batches, True) + with 
self.assertWarns(FutureWarning) as cm: + args = RegressionGaudiTrainingArguments( + output_dir=tmp_dir, + accelerator_config={ + "even_batches": False, + }, + split_batches=True, + use_habana=True, + ) + self.assertIn("split_batches", str(cm.warnings[0].message)) + trainer = GaudiTrainer(model=model, gaudi_config=gaudi_config, args=args, eval_dataset=eval_dataset) + self.assertEqual(trainer.accelerator.split_batches, True) + self.assertEqual(trainer.accelerator.even_batches, False) + self.assertEqual(trainer.accelerator.dispatch_batches, None) + + def test_accelerator_config_only_deprecated_args(self): + with tempfile.TemporaryDirectory() as tmp_dir: + with self.assertWarns(FutureWarning) as cm: + gaudi_config = get_gaudi_config() + args = RegressionGaudiTrainingArguments( + output_dir=tmp_dir, + split_batches=True, + use_habana=True, + ) + self.assertIn("split_batches", str(cm.warnings[0].message)) + config = RegressionModelConfig(a=1.5, b=2.5) + model = RegressionPreTrainedModel(config) + eval_dataset = SampleIterableDataset() + trainer = GaudiTrainer(model=model, gaudi_config=gaudi_config, args=args, eval_dataset=eval_dataset) + self.assertEqual(trainer.accelerator.split_batches, True) + def test_profiling(self): # 24 total steps and compilation takes place during the 1st three steps trainer = get_regression_trainer(profiling_warmup_steps=3, profiling_steps=21) @@ -1912,7 +2082,13 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - for model in ["test-trainer", "test-trainer-epoch", "test-trainer-step", "test-trainer-tensorboard"]: + for model in [ + "test-trainer", + "test-trainer-epoch", + "test-trainer-step", + "test-trainer-tensorboard", + "test-trainer-tags", + ]: try: delete_repo(token=cls._token, repo_id=model) except HTTPError: @@ -2043,6 +2219,31 @@ def test_push_to_hub_with_tensorboard_logs(self): assert found_log is True, "No tensorboard log found in repo" + def test_push_to_hub_tags(self): + # Checks if `trainer.push_to_hub()` works 
correctly by adding the desired + # tag without having to pass `tags` in `push_to_hub` + # see: + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = get_regression_trainer( + output_dir=os.path.join(tmp_dir, "test-trainer-tags"), + push_to_hub=True, + hub_token=self._token, + ) + + trainer.model.add_model_tags(["test-trainer-tags"]) + + url = trainer.push_to_hub() + + # Extract repo_name from the url + re_search = re.search(ENDPOINT_STAGING + r"/([^/]+/[^/]+)/", url) + self.assertTrue(re_search is not None) + repo_name = re_search.groups()[0] + + self.assertEqual(repo_name, f"{USER}/test-trainer-tags") + + model_card = ModelCard.load(repo_name) + self.assertTrue("test-trainer-tags" in model_card.data.tags) + @require_torch @require_optuna diff --git a/tests/test_trainer_seq2seq.py b/tests/test_trainer_seq2seq.py index d4a9f97e3d..816fb47138 100644 --- a/tests/test_trainer_seq2seq.py +++ b/tests/test_trainer_seq2seq.py @@ -49,7 +49,7 @@ def test_finetune_t5(self): ) model = T5ForConditionalGeneration.from_pretrained("hf-internal-testing/tiny-random-t5-v1.1") - tokenizer = AutoTokenizer.from_pretrained("t5-small") + tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small") model.config.max_length = 128 diff --git a/tests/transformers/tests/models/llama/test_modeling_llama.py b/tests/transformers/tests/models/llama/test_modeling_llama.py index 6b76834a50..2c505b6811 100644 --- a/tests/transformers/tests/models/llama/test_modeling_llama.py +++ b/tests/transformers/tests/models/llama/test_modeling_llama.py @@ -17,10 +17,11 @@ import unittest from parameterized import parameterized -from transformers import LlamaConfig, is_torch_available, set_seed +from transformers import LlamaConfig, is_torch_available from transformers.testing_utils import require_torch, slow from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi +from optimum.habana.utils import set_seed from ...generation.test_utils import GenerationTesterMixin from 
...test_configuration_common import ConfigTester