diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md index 83a481970c..e14256792a 100644 --- a/examples/text-generation/README.md +++ b/examples/text-generation/README.md @@ -241,7 +241,7 @@ While `--bucket_size` works for any model without model file changes, an even mo ### Running with FP8 -Llama2-70b, Llama2-7b and Mixtral-8x7B in FP8 are enabled using the Quantization Toolkit (HQT), which provides model measurement and quantization capabilities in PyTorch. +Llama2-70b, Llama2-7b, Mixtral-8x7B, Falcon-7B, Falcon-40B, and Falcon-180B in FP8 are enabled using the Quantization Toolkit (HQT), which provides model measurement and quantization capabilities in PyTorch. More information on enabling fp8 in SynapseAI is available here: https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html @@ -321,6 +321,38 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_mixtral.json python run_generati --bf16 \ --fp8 ``` + +Here is an example to measure the tensor quantization statistics on Falcon-180B with 8 cards: +> Please note that Falcon-180B is a gated model, and users are required to request access to it. Please refer to the instructions provided in the StarCoder example above. +```bash +QUANT_CONFIG=./quantization_config/maxabs_measure_include_outputs.json python ../gaudi_spawn.py \ +--use_deepspeed --world_size 8 run_lm_eval.py \ +-o acc_falcon180b_bs1_quant.txt \ +--model_name_or_path tiiuae/falcon-180B \ +--use_hpu_graphs \ +--use_kv_cache \ +--trim_logits \ +--batch_size 1 \ +--bf16 \ +--reuse_cache +``` + +Here is an example to quantize the model based on previous measurements for Falcon-180B with 8 cards: +```bash +QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \ +--use_deepspeed --world_size 8 run_generation.py \ +--model_name_or_path tiiuae/falcon-180B \ +--use_hpu_graphs \ +--use_kv_cache \ +--limit_hpu_graphs \ +--max_input_tokens 128 \ +--max_new_tokens 2048 \ +--batch_size 110 \ +--bf16 \ +--reuse_cache \ +--trim_logits \ +--fp8 +``` `--fp8` is required to enable quantization in fp8. diff --git a/examples/text-generation/run_lm_eval.py b/examples/text-generation/run_lm_eval.py index 4ae8dcb26c..554490fd57 100644 --- a/examples/text-generation/run_lm_eval.py +++ b/examples/text-generation/run_lm_eval.py @@ -75,10 +75,15 @@ def __init__(self, tokenizer, model, args, options): self.options = options self._device = args.device self.model_inputs = {"use_cache": self.options.use_cache} - if self.model.config.model_type == "llama": + if self.model.config.model_type == "llama" or "falcon": self.model_inputs.update( { "reuse_cache": self.options.reuse_cache, + } + ) + if self.model.config.model_type == "llama": + self.model_inputs.update( + { "attn_softmax_bf16": self.options.attn_softmax_bf16, } ) diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py index e8c847c2f7..f5e21ae532 100644 --- a/examples/text-generation/utils.py +++ b/examples/text-generation/utils.py @@ -237,7 +237,7 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger): model = deepspeed.init_inference(model, **ds_inference_kwargs) model = model.module - if model.config.model_type == "llama": + if model.config.model_type in ["llama", "falcon"]: patch_scoped_linear_all_reduce(model) if args.quant_config: diff --git a/optimum/habana/transformers/generation/utils.py b/optimum/habana/transformers/generation/utils.py index aa7d92ebce..9a249f8f35 100755 --- a/optimum/habana/transformers/generation/utils.py +++ b/optimum/habana/transformers/generation/utils.py @@ -584,7 +584,8 @@ def generate( assert self.config.model_type in [ "llama", "mistral", - ], "reuse_cache only supported by llama and mistral at the moment" + "falcon", + ], "reuse_cache only supported by llama, mistral and falcon at the moment" if not generation_config.bucket_internal: assert ( generation_config.bucket_size <= 0 @@ -740,7 +741,7 @@ def generate( ) model_kwargs["kv_cache_len"] = calculated_max_length - if self.config.model_type in ["llama"]: + if self.config.model_type in ["llama", "falcon"]: if self.config.max_position_embeddings < calculated_max_length: unwrap_deepspeed_model(self).update_sincos_cache(seq_len=calculated_max_length) diff --git a/optimum/habana/transformers/modeling_utils.py b/optimum/habana/transformers/modeling_utils.py index 9d4e473aab..f7f88cc690 100644 --- a/optimum/habana/transformers/modeling_utils.py +++ b/optimum/habana/transformers/modeling_utils.py @@ -26,7 +26,10 @@ GaudiBloomMLP, GaudiCodeGenAttention, GaudiCodeGenForCausalLM, + GaudiFalconAttention, + GaudiFalconDecoderLayer, GaudiFalconForCausalLM, + GaudiFalconMLP, GaudiFalconModel, GaudiGPT2Attention, GaudiGPT2LMHeadModel, @@ -84,9 +87,7 @@ gaudi_conv1d_forward, gaudi_esm_for_protein_folding_forward, gaudi_esmfolding_trunk_forward, - gaudi_falcon_attention_forward, gaudi_falcon_attention_split_heads, - gaudi_falcon_decoder_layer_forward, gaudi_generate_speech, gaudi_get_extended_attention_mask, gaudi_gpt2_block_forward, @@ -298,10 +299,11 @@ def adapt_transformers_to_gaudi(): transformers.models.llama.modeling_llama.LlamaRMSNorm.forward = gaudi_llama_rmsnorm_forward # Optimization for falcon generation on Gaudi + transformers.models.falcon.modeling_falcon.FalconAttention = GaudiFalconAttention transformers.models.falcon.modeling_falcon.FalconForCausalLM = GaudiFalconForCausalLM + transformers.models.falcon.modeling_falcon.FalconMLP = GaudiFalconMLP transformers.models.falcon.modeling_falcon.FalconModel = GaudiFalconModel - transformers.models.falcon.modeling_falcon.FalconDecoderLayer.forward = gaudi_falcon_decoder_layer_forward - transformers.models.falcon.modeling_falcon.FalconAttention.forward = gaudi_falcon_attention_forward + transformers.models.falcon.modeling_falcon.FalconDecoderLayer = GaudiFalconDecoderLayer transformers.models.falcon.modeling_falcon.FalconAttention._split_heads = gaudi_falcon_attention_split_heads # Optimization for t5 on Gaudi diff --git a/optimum/habana/transformers/models/__init__.py b/optimum/habana/transformers/models/__init__.py index d0eb8b2dcd..481ddae49d 100644 --- a/optimum/habana/transformers/models/__init__.py +++ b/optimum/habana/transformers/models/__init__.py @@ -43,11 +43,12 @@ gaudi_rot_vec_mul, ) from .falcon import ( + GaudiFalconAttention, + GaudiFalconDecoderLayer, GaudiFalconForCausalLM, + GaudiFalconMLP, GaudiFalconModel, - gaudi_falcon_attention_forward, gaudi_falcon_attention_split_heads, - gaudi_falcon_decoder_layer_forward, ) from .gpt2 import GaudiGPT2Attention, GaudiGPT2LMHeadModel, gaudi_gpt2_block_forward, gaudi_gpt2_forward from .gpt_bigcode import ( diff --git a/optimum/habana/transformers/models/falcon/__init__.py b/optimum/habana/transformers/models/falcon/__init__.py index 44ac5451f6..00c73ad110 100644 --- a/optimum/habana/transformers/models/falcon/__init__.py +++ b/optimum/habana/transformers/models/falcon/__init__.py @@ -1,7 +1,8 @@ from .modeling_falcon import ( + GaudiFalconAttention, + GaudiFalconDecoderLayer, GaudiFalconForCausalLM, + GaudiFalconMLP, GaudiFalconModel, - gaudi_falcon_attention_forward, gaudi_falcon_attention_split_heads, - gaudi_falcon_decoder_layer_forward, ) diff --git a/optimum/habana/transformers/models/falcon/modeling_falcon.py b/optimum/habana/transformers/models/falcon/modeling_falcon.py index 98e3555e95..de69fa2956 100644 --- a/optimum/habana/transformers/models/falcon/modeling_falcon.py +++ b/optimum/habana/transformers/models/falcon/modeling_falcon.py @@ -1,5 +1,6 @@ import contextlib import math +import os import warnings from typing import Optional, Tuple, Union @@ -27,6 +28,7 @@ import habana_frameworks.torch.core as htcore +from torch import nn from torch.nn import CrossEntropyLoss from torch.nn import functional as F from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa @@ -34,12 +36,15 @@ BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions, ) +from transformers.models.falcon.configuration_falcon import FalconConfig from transformers.models.falcon.modeling_falcon import ( + FalconAttention, + FalconDecoderLayer, FalconForCausalLM, + FalconMLP, FalconModel, apply_rotary_pos_emb, build_alibi_tensor, - dropout_add, ) from transformers.utils import logging @@ -52,6 +57,20 @@ logger = logging.get_logger(__name__) +def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool) -> torch.Tensor: + """ + Copied from transformers.models.falcon.modeling_falcon/dropout_add + https://github.com/huggingface/transformers/blob/b338a6c3b8eda29610d4d472cad8cd87cbfdaaed/src/transformers/models/falcon/modeling_falcon.py#L248 + """ + out = F.dropout(x, p=prob, training=training) + if training: + out = residual + out + return out + else: + residual.add_(out) + return residual + + def apply_customized_rope(q, k, cos, sin, position_ids): if q.device.type == "hpu" and FusedRoPE: # TODO: remove `.clone()` when it is fixed in SynapseAI @@ -111,257 +130,504 @@ def gaudi_falcon_attention_split_heads( return query, key, value -def gaudi_falcon_attention_forward( - self, - hidden_states: torch.Tensor, - alibi: Optional[torch.Tensor], - attention_mask: torch.Tensor, - position_ids: Optional[torch.LongTensor] = None, - layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - head_mask: Optional[torch.Tensor] = None, - use_cache: bool = False, - output_attentions: bool = False, - token_idx: Optional[torch.Tensor] = None, - **kwargs, -): +class Softmax(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, dim=None, invAttnHead=None): + return torch.ops.hpu.softmax_fp8(x, dim, None, None, invAttnHead) + + +class Matmul(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, *args, **kwargs): + return torch.matmul(*args, **kwargs) + + +# ScaledDotProductAttention is based on torch.nn.functional.scaled_dot_product_attention +class ScaledDotProductAttention(nn.Module): + def __init__(self, config: FalconConfig): + super().__init__() + self.head_dim = config.hidden_size // config.num_attention_heads + self.bmm1 = Matmul() + self.bmm2 = Matmul() + self.softmax = Softmax() + + def forward(self, query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None) -> torch.Tensor: + L, S = query.size(-2), key.size(-2) + scale_factor = 1 / math.sqrt(self.head_dim) + invAttnHead = torch.tensor(scale_factor, dtype=torch.float32).to("hpu") + + if is_causal: + assert attn_mask is None + attn_bias = torch.zeros(L, S, dtype=query.dtype) + temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0) + attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf")) + attn_bias.to(query.dtype) + + if attn_mask is not None: + if attn_mask.dtype == torch.bool: + attn_mask.masked_fill_(attn_mask.logical_not(), float("-inf")) + + attn_weight = self.bmm1(query, key.transpose(-2, -1)) + + attn_weight += attn_mask + attn_weight = self.softmax(attn_weight, dim=-1, invAttnHead=invAttnHead) + attn_weight = torch.dropout(attn_weight, dropout_p, train=True) + return self.bmm2(attn_weight, value) + + +def update(prev, cur, dim, idx, inp_seq_len): + orig_cur = cur + cur = cur.to(dtype=prev.dtype) + + if prev.shape == cur.shape: + prev.copy_(cur) + return orig_cur + + if cur.shape[-2] > 1 and cur.shape[-2] <= prev.shape[-2]: + # Initialize + prev[:, :, :inp_seq_len, :].copy_(cur) + return orig_cur + assert cur.shape[2] == 1, f"Cannot update kv-cache. Unsupported shapes. prev:{prev.shape} cur:{cur.shape}" + if idx is not None: + prev.index_copy_(dim, idx - 1, cur) + prev_cast = prev.to(orig_cur.dtype) + return prev_cast + else: + return torch.cat((prev, cur), dim=dim) + + +class KVCache(torch.nn.Module): + def __init__(self): + super(KVCache, self).__init__() + self.cache = None + self.inp_seq_len = -1 + + def allocate(self, inp_seq_len, dtype, device, shape): + if self.cache is None or self.cache.shape != shape: + self.inp_seq_len = inp_seq_len + self.cache = torch.zeros(shape, dtype=dtype, device=device) + else: + assert ( + self.inp_seq_len == inp_seq_len + ), f"inp_seq_len must be the same. self.inp_seq_len:{self.inp_seq_len} inp_seq_len:{inp_seq_len}" + self.cache.fill_(0) + + def get_shape(self): + if self.cache is None: + return None + return self.cache.shape + + def forward(self, cur, dim, idx): + return self.update(self.cache, cur, dim, idx, self.inp_seq_len) + + def update(self, prev, cur, dim, idx, inp_seq_len): + return update(prev, cur, dim, idx, inp_seq_len) + + +class GaudiFalconAttention(FalconAttention): """ - Copied from FalconAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py + Inherits from FalconAttention: https://github.com/huggingface/transformers/blob/838b87abe231fd70be5132088d0dee72a7bb8d62/src/transformers/models/falcon/modeling_falcon.py#L267 The only differences are: - add new args token_idx and position_ids - - replace F.scaled_dot_product_attention with Habana torch's version + - replace F.scaled_dot_product_attention with Habana torch's version for BF16 + - use ScaledDotProductAttention for FP8 quantization + - add new arg reuse_cache """ - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] - # 3 x [batch_size, seq_length, num_heads, head_dim] - (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) + def __init__(self, config: FalconConfig): + super().__init__(config) - batch_size, query_length, _, _ = query_layer.shape + if os.getenv("QUANT_CONFIG", ""): + self.sdpa = ScaledDotProductAttention(config) - query_layer = query_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim) - key_layer = key_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim) - value_layer = value_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim) + self.k_cache = KVCache() + self.v_cache = KVCache() + self.inp_seq_len = -1 + self.max_position_embeddings = config.max_position_embeddings - kv_seq_len = key_layer.shape[-2] - if layer_past is not None: - if token_idx is not None: - # When token_idx is used, - # past_kv_length = 0 - # static seq len = (input token len + max output token len) - kv_seq_len = layer_past[0].shape[-2] + def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): + if self.config.new_decoder_architecture: + cache_shape = (batch_size, self.num_heads, max_seq_len, self.head_dim) else: - kv_seq_len += layer_past[0].shape[-2] - if alibi is None: - cos, sin = self.rotary_emb(value_layer, seq_len=kv_seq_len) - query_layer, key_layer = apply_customized_rope(query_layer, key_layer, cos, sin, position_ids) - - if layer_past is not None: - past_key, past_value = layer_past - if token_idx is not None: - past_key.index_copy_(-2, token_idx - 1, key_layer) - past_value.index_copy_(-2, token_idx - 1, value_layer) - key_layer = past_key - value_layer = past_value - else: - # concatenate along seq_length dimension: - # - key: [batch_size, self.num_heads, kv_length, head_dim] - # - value: [batch_size, self.num_heads, kv_length, head_dim] - key_layer = torch.cat((past_key, key_layer), dim=-2) - value_layer = torch.cat((past_value, value_layer), dim=-2) - - kv_length = key_layer.shape[-2] - if use_cache: - present = (key_layer, value_layer) - else: - present = None + cache_shape = (batch_size, 1, max_seq_len, self.head_dim) + device = self.query_key_value.weight.device + dtype = self.config.torch_dtype + self.k_cache.allocate(inp_seq_len, dtype, device, cache_shape) + self.v_cache.allocate(inp_seq_len, dtype, device, cache_shape) + + def update_sincos_cache(self, seq_len): + # Call rotary emb forward() to update cos/sin cache when infering more than self.max_position_embeddings + # This helps in avoiding creation of these caches during actual model forward pass and + # reduce memory consumption and improve performance. + if seq_len > self.max_position_embeddings: + self.max_position_embeddings = seq_len + self.rotary_emb._set_cos_sin_cache( + seq_len, self.query_key_value.weight.device, self.query_key_value.weight.dtype + ) - if alibi is None: - if output_attentions: - attention_scores = query_layer @ key_layer.transpose(-1, -2) - attention_scores /= math.sqrt(self.head_dim) + def pre_attn_forward( + self, + hidden_states: torch.Tensor, + alibi: Optional[torch.Tensor], + attention_mask: torch.Tensor, + position_ids: Optional[torch.LongTensor] = None, + layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + token_idx: Optional[torch.Tensor] = None, + reuse_cache: Optional[bool] = False, + cache_idx: int = None, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] + # 3 x [batch_size, seq_length, num_heads, head_dim] + (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) + + batch_size, query_length, _, _ = query_layer.shape + + query_layer = query_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim) + key_layer = key_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim) + value_layer = value_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim) + + kv_seq_len = key_layer.shape[-2] + if layer_past is not None: + if token_idx is not None: + if reuse_cache: + kv_seq_len = layer_past[0][-2] + else: + kv_seq_len = layer_past[0].shape[-2] + else: + kv_seq_len += layer_past[0].shape[-2] + + if alibi is None: + cos, sin = self.rotary_emb(value_layer, seq_len=kv_seq_len) + query_layer, key_layer = apply_customized_rope(query_layer, key_layer, cos, sin, position_ids) + + if use_cache: + if self.training: + present = None + else: + if reuse_cache: + key_layer = self.k_cache(key_layer, -2, token_idx) + value_layer = self.v_cache(value_layer, -2, token_idx) + present = (self.k_cache.get_shape(), self.v_cache.get_shape()) + else: + if layer_past is None: + past_key = torch.zeros( + key_layer.shape, + dtype=self.query_key_value.weight.dtype, + device=self.query_key_value.weight.device, + ) + past_value = torch.zeros( + key_layer.shape, + dtype=self.query_key_value.weight.dtype, + device=self.query_key_value.weight.device, + ) + layer_past = (past_key, past_value) + key_layer = self.k_cache.update( + layer_past[0], key_layer, -2, token_idx, self.inp_seq_len + ) # k_layer bs*1, q_len, head_dim + value_layer = self.v_cache.update(layer_past[1], value_layer, -2, token_idx, self.inp_seq_len) + present = layer_past + + if cache_idx is not None and query_length == 1: + key_layer = key_layer[:, :, :cache_idx, :] + value_layer = value_layer[:, :, :cache_idx, :] + attention_mask = attention_mask[:, :, :, :cache_idx] + else: + present = None - attention_scores = F.softmax(attention_scores + attention_mask, dim=-1, dtype=hidden_states.dtype) - # It is unclear why neither dropout nor head_mask is applied here (while it is with alibi). - attn_output = attention_scores @ value_layer + if self.training and layer_past is None: + kv_length = key_layer.shape[-2] else: - if FusedSDPA: - with sdp_kernel(enable_recompute=False) if SDPContext else contextlib.nullcontext(): - attn_output = FusedSDPA.apply( + kv_length = present[0][-2] if reuse_cache else present[0].shape[-2] + + if alibi is None: + if output_attentions: + attention_scores = query_layer @ key_layer.transpose(-1, -2) + attention_scores /= math.sqrt(self.head_dim) + + attention_scores = F.softmax(attention_scores + attention_mask, dim=-1, dtype=hidden_states.dtype) + # It is unclear why neither dropout nor head_mask is applied here (while it is with alibi). + attn_output = attention_scores @ value_layer + else: + if FusedSDPA: + if os.getenv("QUANT_CONFIG", ""): + attn_output = self.sdpa( + query_layer, key_layer, value_layer, attention_mask, 0.0, is_causal=False + ) + else: + with sdp_kernel(enable_recompute=False) if SDPContext else contextlib.nullcontext(): + attn_output = FusedSDPA.apply( + query_layer, + key_layer, + value_layer, + attention_mask, + 0.0, + # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case query_length == 1. + self.is_causal and attention_mask is None and query_length > 1, + ) + else: + # Workaround util scaled_dot_product_attention support broadcast. + if self.training is True and query_layer.shape != key_layer.shape: + key_layer = torch.broadcast_to(key_layer, query_layer.shape) + value_layer = torch.broadcast_to(value_layer, query_layer.shape) + attn_output = F.scaled_dot_product_attention( query_layer, key_layer, value_layer, attention_mask, 0.0, # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case query_length == 1. - self.is_causal and attention_mask is None and query_length > 1, + is_causal=self.is_causal and attention_mask is None and query_length > 1, ) - else: - # Workaround util scaled_dot_product_attention support broadcast. - if self.training is True and query_layer.shape != key_layer.shape: - key_layer = torch.broadcast_to(key_layer, query_layer.shape) - value_layer = torch.broadcast_to(value_layer, query_layer.shape) - attn_output = F.scaled_dot_product_attention( - query_layer, - key_layer, - value_layer, - attention_mask, - 0.0, - # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case query_length == 1. - is_causal=self.is_causal and attention_mask is None and query_length > 1, - ) - # Performance improvement for HPU - if self.training is True and htcore: - htcore.mark_step() - attention_scores = None + # Performance improvement for HPU + if self.training is True and htcore: + htcore.mark_step() + attention_scores = None - attn_output = attn_output.view(batch_size, -1, query_length, self.head_dim) - attn_output = attn_output.permute(0, 2, 1, 3) - attn_output = attn_output.reshape(batch_size, query_length, -1) + attn_output = attn_output.view(batch_size, -1, query_length, self.head_dim) + attn_output = attn_output.permute(0, 2, 1, 3) + attn_output = attn_output.reshape(batch_size, query_length, -1) - attn_output = self.dense(attn_output) + attn_output = self.dense(attn_output) - if output_attentions: - return attn_output, present, attention_scores - else: - return attn_output, present + if output_attentions: + return attn_output, present, attention_scores + else: + return attn_output, present, _ - else: - if self._use_sdpa and not output_attentions and head_mask is None: - if FusedSDPA: - with sdp_kernel(enable_recompute=False) if SDPContext else contextlib.nullcontext(): - attn_output = FusedSDPA.apply( + else: + if self._use_sdpa and not output_attentions and head_mask is None: + if FusedSDPA: + with sdp_kernel(enable_recompute=False) if SDPContext else contextlib.nullcontext(): + attn_output = FusedSDPA.apply( + query_layer, + key_layer, + value_layer, + attention_mask, + self.attention_dropout.p if self.training else 0.0, + self.is_causal and attention_mask is None and query_length > 1, + ) + else: + attn_output = F.scaled_dot_product_attention( query_layer, key_layer, value_layer, - attention_mask, - self.attention_dropout.p if self.training else 0.0, - self.is_causal and attention_mask is None and query_length > 1, + attn_mask=attention_mask, + dropout_p=self.attention_dropout.p if self.training else 0.0, + is_causal=self.is_causal and attention_mask is None and query_length > 1, ) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) + + attn_output = self.dense(attn_output) else: - attn_output = F.scaled_dot_product_attention( - query_layer, - key_layer, - value_layer, - attn_mask=attention_mask, - dropout_p=self.attention_dropout.p if self.training else 0.0, - is_causal=self.is_causal and attention_mask is None and query_length > 1, - ) - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) + matmul_result = query_layer @ key_layer.transpose(-1, -2) - attn_output = self.dense(attn_output) - else: - matmul_result = query_layer @ key_layer.transpose(-1, -2) + # change view to [batch_size, num_heads, q_length, kv_length] + attention_scores = matmul_result.view(batch_size, self.num_heads, query_length, kv_length) - # change view to [batch_size, num_heads, q_length, kv_length] - attention_scores = matmul_result.view(batch_size, self.num_heads, query_length, kv_length) + # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length] + input_dtype = attention_scores.dtype + # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38` + if input_dtype == torch.float16 or input_dtype == torch.bfloat16: + attention_scores = attention_scores.to(torch.float32) - # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length] - input_dtype = attention_scores.dtype - # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38` - if input_dtype == torch.float16 or input_dtype == torch.bfloat16: - attention_scores = attention_scores.to(torch.float32) + attention_logits = attention_scores + alibi.view(batch_size, self.num_heads, 1, -1) + attention_logits *= self.inv_norm_factor + attention_probs = F.softmax(attention_logits + attention_mask, dim=-1, dtype=hidden_states.dtype) + # [batch_size, num_heads, q_length, kv_length] + attention_probs = self.attention_dropout(attention_probs) - attention_logits = attention_scores + alibi.view(batch_size, self.num_heads, 1, -1) - attention_logits *= self.inv_norm_factor - attention_probs = F.softmax(attention_logits + attention_mask, dim=-1, dtype=hidden_states.dtype) - # [batch_size, num_heads, q_length, kv_length] - attention_probs = self.attention_dropout(attention_probs) + if head_mask is not None: + attention_probs = attention_probs * head_mask - if head_mask is not None: - attention_probs = attention_probs * head_mask + # change view [batch_size, num_heads, q_length, kv_length] + attention_probs_reshaped = attention_probs.view(batch_size, self.num_heads, query_length, kv_length) - # change view [batch_size, num_heads, q_length, kv_length] - attention_probs_reshaped = attention_probs.view(batch_size, self.num_heads, query_length, kv_length) + # matmul: [batch_size * num_heads, q_length, head_dim] + attn_output = (attention_probs_reshaped @ value_layer).flatten(0, 1) - # matmul: [batch_size * num_heads, q_length, head_dim] - attn_output = (attention_probs_reshaped @ value_layer).flatten(0, 1) + # change view [batch_size, q_length, num_heads * head_dim] + attn_output = self._merge_heads(attn_output) - # change view [batch_size, q_length, num_heads * head_dim] - attn_output = self._merge_heads(attn_output) + attn_output = self.dense(attn_output) - attn_output = self.dense(attn_output) + if output_attentions: + return attn_output, present, attention_probs + else: + return attn_output, present, _ - if output_attentions: - return attn_output, present, attention_probs - else: - return attn_output, present - - -def gaudi_falcon_decoder_layer_forward( - self, - hidden_states: torch.Tensor, - alibi: Optional[torch.Tensor], - attention_mask: torch.Tensor, - position_ids: Optional[torch.LongTensor] = None, - layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - head_mask: Optional[torch.Tensor] = None, - use_cache: bool = False, - output_attentions: bool = False, - token_idx: Optional[torch.Tensor] = None, - **kwargs, -): + def attention_all_reduce(self, attn_output): + if hasattr(self.dense, "all_reduce"): + self.dense.all_reduce(attn_output) + + def post_attn_forward(self, attn_output): + if hasattr(self.dense, "all_reduce"): + self.dense.post_all_reduce(attn_output) + return attn_output + + +class GaudiFalconMLP(FalconMLP): + """ + Inherits from FalconMLP: https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py + """ + + def pre_mlp_forward(self, x): + x = self.act(self.dense_h_to_4h(x)) + x = self.dense_4h_to_h(x) + return x + + def mlp_all_reduce(self, x): + if hasattr(self.dense_4h_to_h, "all_reduce"): + self.dense_4h_to_h.all_reduce(x) + + def post_mlp_forward(self, x): + if hasattr(self.dense_4h_to_h, "all_reduce"): + self.dense_4h_to_h.post_all_reduce(x) + return x + + +class GaudiFalconDecoderLayer(FalconDecoderLayer): """ - Copied from FalconDecoderLayer.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py + Inherits from FalconDecoderLayer: https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py The only differences are: - add new args token_idx and position_ids - add token_idx and position_ids into attention inputs + - add new args reuse_cache """ - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - residual = hidden_states + def __init__(self, config: FalconConfig): + super().__init__(config) + self.self_attention = GaudiFalconAttention(config) - if self.config.new_decoder_architecture: - attention_layernorm_out = self.ln_attn(hidden_states) - mlp_layernorm_out = self.ln_mlp(hidden_states) - else: - attention_layernorm_out = self.input_layernorm(hidden_states) - - # Self attention. - attn_outputs = self.self_attention( - attention_layernorm_out, - layer_past=layer_past, - attention_mask=attention_mask, - position_ids=position_ids, - alibi=alibi, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - token_idx=token_idx, + def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): + self.self_attention.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) + + def update_sincos_cache(self, seq_len): + self.self_attention.update_sincos_cache(seq_len) + + def forward( + self, + hidden_states: torch.Tensor, + alibi: Optional[torch.Tensor], + attention_mask: torch.Tensor, + position_ids: Optional[torch.LongTensor] = None, + layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + token_idx: Optional[torch.Tensor] = None, + reuse_cache: Optional[bool] = False, + cache_idx: int = None, **kwargs, - ) + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + residual = hidden_states + ( + hidden_states, + present, + attn_scores, + attention_layernorm_out, + mlp_layernorm_out, + ) = self.pre_attn( # layernorm + attention before AllReduce + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + position_ids=position_ids, + alibi=alibi, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + token_idx=token_idx, + reuse_cache=reuse_cache, + cache_idx=cache_idx, + **kwargs, + ) - attention_output = attn_outputs[0] + self.self_attention.attention_all_reduce(hidden_states) + hidden_states = self.self_attention.post_attn_forward(hidden_states) - if not self.config.new_decoder_architecture: - if self.config.parallel_attn: - mlp_layernorm_out = attention_layernorm_out - else: - residual = dropout_add(attention_output, residual, self.config.attention_dropout, training=self.training) - mlp_layernorm_out = self.post_attention_layernorm(residual) + attention_output = hidden_states - outputs = attn_outputs[1:] + if not self.config.new_decoder_architecture: + if self.config.parallel_attn: + mlp_layernorm_out = attention_layernorm_out + else: + residual = dropout_add( + attention_output, residual, self.config.attention_dropout, training=self.training + ) + mlp_layernorm_out = self.post_attention_layernorm(residual) - # MLP. - mlp_output = self.mlp(mlp_layernorm_out) + outputs = (present, attn_scores) - if self.config.new_decoder_architecture or self.config.parallel_attn: - mlp_output += attention_output + hidden_states = self.mlp.pre_mlp_forward(mlp_layernorm_out) + self.mlp.mlp_all_reduce(hidden_states) + hidden_states = self.mlp.post_mlp_forward(hidden_states) - output = dropout_add(mlp_output, residual, self.config.hidden_dropout, training=self.training) + if self.config.new_decoder_architecture or self.config.parallel_attn: + hidden_states += attention_output - if use_cache: - outputs = (output,) + outputs - else: - outputs = (output,) + outputs[1:] + output = dropout_add(hidden_states, residual, self.config.hidden_dropout, training=self.training) + + if use_cache: + outputs = (output,) + outputs + else: + outputs = (output,) + outputs[1:] + + return outputs # hidden_states, present, attentions + + def pre_attn( + self, + hidden_states: torch.Tensor, + alibi: Optional[torch.Tensor], + attention_mask: torch.Tensor, + position_ids: Optional[torch.LongTensor] = None, + layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + token_idx: Optional[torch.Tensor] = None, + reuse_cache: Optional[bool] = False, + cache_idx: int = None, + ): + if self.config.new_decoder_architecture: + attention_layernorm_out = self.ln_attn(hidden_states) + mlp_layernorm_out = self.ln_mlp(hidden_states) + else: + attention_layernorm_out = self.input_layernorm(hidden_states) + mlp_layernorm_out = None - return outputs # hidden_states, present, attentions + # Self attention. + attn_outputs, present, attn_scores = self.self_attention.pre_attn_forward( + attention_layernorm_out, + layer_past=layer_past, + attention_mask=attention_mask, + position_ids=position_ids, + alibi=alibi, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + token_idx=token_idx, + reuse_cache=reuse_cache, + cache_idx=cache_idx, + ) + + return attn_outputs, present, attn_scores, attention_layernorm_out, mlp_layernorm_out class GaudiFalconModel(FalconModel): @@ -370,11 +636,17 @@ class GaudiFalconModel(FalconModel): The only differences are: - add new args token_idx and position_ids - add token_idx and position_ids into decoder inputs - - set past_key_values_length=0 when token_idx is used (with static input shape) - - add new arg tgt_len to _expand_mask because past_key_values_length is no longer valid with token_idx - - use old version of _make_causal_mask to workaround toch.triu that is not supported in Synapse + - add new arg reuse_cache """ + def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): + for layer in self.h: + layer.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) + + def update_sincos_cache(self, seq_len): + for layer in self.h: + layer.update_sincos_cache(seq_len) + def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -388,6 +660,8 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, token_idx: Optional[torch.Tensor] = None, + reuse_cache: Optional[bool] = False, + cache_idx: int = None, ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -426,7 +700,10 @@ def forward( # Compute alibi tensor: check build_alibi_tensor documentation past_key_values_length = 0 if past_key_values[0] is not None and token_idx is None: - past_key_values_length = past_key_values[0][0].shape[-2] + if reuse_cache: + past_key_values_length = past_key_values[0][0][-2] + else: + past_key_values_length = past_key_values[0][0].shape[-2] if self.use_alibi: mask = ( @@ -489,6 +766,7 @@ def forward( attention_mask = _gaudi_prepare_4d_causal_attention_mask( attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length ) + else: # 4d mask is passed through the layers attention_mask = _gaudi_prepare_4d_causal_attention_mask( @@ -501,6 +779,7 @@ def forward( # head_mask has shape n_layer x batch x num_heads x N x N head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + htcore.mark_step() for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) @@ -529,6 +808,8 @@ def forward( output_attentions=output_attentions, alibi=alibi, token_idx=token_idx, + reuse_cache=reuse_cache, + cache_idx=cache_idx, ) hidden_states = outputs[0] @@ -563,8 +844,16 @@ class GaudiFalconForCausalLM(FalconForCausalLM): - add token_idx and position_ids into model inputs - from step2 when enable KV cache, slice next_input_ids from input_ids base on the token_idx - from step2 when enable KV cache, slice next_position_ids from position_ids base on the token_idx + - add new args reuse_cache """ + def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len): + self.transformer.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len) + self.kv_cache_len = max_seq_len + + def update_sincos_cache(self, seq_len): + self.transformer.update_sincos_cache(seq_len) + def prepare_inputs_for_generation( self, input_ids: torch.LongTensor, @@ -574,6 +863,7 @@ def prepare_inputs_for_generation( token_idx: Optional[torch.Tensor] = None, **kwargs, ) -> dict: + reuse_cache = kwargs.get("reuse_cache") if past_key_values is not None: if token_idx is not None: input_ids = torch.index_select(input_ids, 1, token_idx - 1) @@ -588,6 +878,10 @@ def prepare_inputs_for_generation( remove_prefix_length = input_ids.shape[1] - 1 input_ids = input_ids[:, remove_prefix_length:] + elif reuse_cache and token_idx is not None: + # With reuse_cache, KV cache is pre allocated hence for the 1st token we can slice the inputs till token idx for the fwd pass + input_ids = input_ids[:, :token_idx] + attention_mask = attention_mask[:, :token_idx] # Note: versions of Falcon with alibi do not use position_ids. It is used with RoPE. if ( @@ -612,6 +906,8 @@ def prepare_inputs_for_generation( "use_cache": kwargs.get("use_cache"), "attention_mask": attention_mask, "token_idx": token_idx, + "reuse_cache": reuse_cache, + "cache_idx": kwargs.get("cache_idx"), } def forward( @@ -628,6 +924,9 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, token_idx: Optional[torch.Tensor] = None, + reuse_cache: Optional[bool] = False, + trim_logits: Optional[bool] = False, + cache_idx: int = None, ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -649,9 +948,18 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, token_idx=token_idx, + reuse_cache=reuse_cache, + cache_idx=cache_idx, ) hidden_states = transformer_outputs[0] + _, seq_len, _ = hidden_states.shape + if seq_len > 1 and trim_logits and not self.training: + if token_idx is not None: + hidden_states = hidden_states.index_select(1, token_idx - 1) + else: + hidden_states = hidden_states[:, -1:, :] + lm_logits = self.lm_head(hidden_states) loss = None diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py index af7e9cfab3..8664dfa2e8 100644 --- a/tests/test_text_generation_example.py +++ b/tests/test_text_generation_example.py @@ -27,6 +27,9 @@ ("mistralai/Mixtral-8x7B-v0.1", 23.78652574031883), ("microsoft/phi-2", 218.08752713569007), ], + "fp8": [ + ("tiiuae/falcon-180B", 47.67900945905787), + ], "deepspeed": [ ("bigscience/bloomz", 36.34664210641816), ("meta-llama/Llama-2-70b-hf", 61.973950428647164), @@ -71,6 +74,7 @@ def _test_text_generation( deepspeed: bool = False, world_size: int = 8, torch_compile: bool = False, + fp8: bool = False, ): command = ["python3"] path_to_example_dir = Path(__file__).resolve().parent.parent / "examples" @@ -108,6 +112,12 @@ def _test_text_generation( if not deepspeed: command.append("--bf16") + if fp8: + command += [ + "--reuse_cache", + "--trim_logits", + ] + with TemporaryDirectory() as tmp_dir: command.append(f"--output_dir {tmp_dir}") print(f"\n\nCommand to test: {' '.join(command)}\n") @@ -117,6 +127,16 @@ def _test_text_generation( pattern = re.compile(r"([\"\'].+?[\"\'])|\s") command = [x for y in command for x in re.split(pattern, y) if x] + if fp8: + env_variables["QUANT_CONFIG"] = os.path.join( + path_to_example_dir, "text-generation/quantization_config/maxabs_measure_include_outputs.json" + ) + subprocess.run(command, env=env_variables) + env_variables["QUANT_CONFIG"] = os.path.join( + path_to_example_dir, "text-generation/quantization_config/maxabs_quant.json" + ) + command.append("--fp8") + proc = subprocess.run(command, env=env_variables) # Ensure the run finished without any issue @@ -140,6 +160,13 @@ def test_text_generation_bf16(model_name: str, baseline: float, token: str): _test_text_generation(model_name, baseline, token) +@pytest.mark.parametrize("model_name, baseline", MODELS_TO_TEST["fp8"]) +def test_text_generation_fp8(model_name: str, baseline: float, token: str): + deepspeed = True if "falcon-180B" in model_name else False + world_size = 8 if "falcon-180B" in model_name else None + _test_text_generation(model_name, baseline, token, deepspeed=deepspeed, world_size=world_size, fp8=True) + + @pytest.mark.parametrize("model_name, baseline", MODELS_TO_TEST["deepspeed"]) def test_text_generation_deepspeed(model_name: str, baseline: float, token: str): world_size = 2 if "opt-66b" in model_name else 8