Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion examples/contrastive-image-text/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -250,5 +250,8 @@ python run_clip.py \
--use_lazy_mode \
--use_hpu_graphs_for_inference \
--gaudi_config_name Habana/clip \
--bf16
--bf16 \
--mediapipe_dataloader
```

> `--mediapipe_dataloader` only works on Gaudi2.
60 changes: 26 additions & 34 deletions examples/contrastive-image-text/clip_media_pipe.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -24,29 +24,37 @@

try:
from habana_frameworks.mediapipe import fn
from habana_frameworks.mediapipe.backend.nodes import opnode_tensor_info
from habana_frameworks.mediapipe.backend.operator_specs import schema
from habana_frameworks.mediapipe.media_types import dtype, ftype, imgtype, randomCropType, readerOutType
from habana_frameworks.mediapipe.mediapipe import MediaPipe
from habana_frameworks.mediapipe.operators.media_nodes import MediaReaderNode
from habana_frameworks.mediapipe.operators.reader_nodes.read_image_from_dir import get_max_file
from habana_frameworks.mediapipe.operators.reader_nodes.reader_nodes import (
media_ext_reader_op_impl,
media_ext_reader_op_tensor_info,
)
from habana_frameworks.torch.hpu import get_device_name
except ImportError:
pass

read_image_text_from_dataset_params = {
"label_dtype": dtype.UINT64,
"dataset": None,
}

class read_image_text_from_dataset(MediaReaderNode):

class read_image_text_from_dataset(media_ext_reader_op_impl):
"""
Class defining read image/text from directory node.
Class defining read image/text from clip dataset.

"""

def __init__(self, name, guid, device, inputs, params, cparams, node_attr):
super().__init__(name, guid, device, inputs, params, cparams, node_attr)
def __init__(self, params):
self.batch_size = 1
params = params["priv_params"]
self.meta_dtype = params["label_dtype"]
self.dataset = params["dataset"]
self.epoch = 0

self.batch_sampler_iter = None
self.iter_loc = 0
self.num_imgs_slice = len(ClipMediaPipe.batch_sampler.sampler)
self.num_batches_slice = len(ClipMediaPipe.batch_sampler)

Expand All @@ -62,13 +70,13 @@ def set_params(self, params):

def gen_output_info(self):
out_info = []
o = opnode_tensor_info(dtype.NDT, np.array([self.batch_size], dtype=np.uint32), "")
o = media_ext_reader_op_tensor_info(dtype.NDT, np.array([self.batch_size], dtype=np.uint32), "")
out_info.append(o)
o = opnode_tensor_info(
o = media_ext_reader_op_tensor_info(
self.meta_dtype, np.array([self.dataset.text_max_length, self.batch_size], dtype=np.uint32), ""
)
out_info.append(o)
o = opnode_tensor_info(
o = media_ext_reader_op_tensor_info(
self.meta_dtype, np.array([self.dataset.text_max_length, self.batch_size], dtype=np.uint32), ""
)
out_info.append(o)
Expand Down Expand Up @@ -112,27 +120,6 @@ def __next__(self):
return img_list, input_id_list, attention_mask_list


read_image_text_from_dataset_params = {
"label_dtype": dtype.UINT64,
"dataset": None,
}
schema.add_operator(
"ClipDataReader",
None,
0,
0,
[],
3,
read_image_text_from_dataset_params,
None,
read_image_text_from_dataset,
dtype.NDT,
)
op_class = fn.operator_add("ClipDataReader")
op_class.__module__ = fn.__name__
setattr(fn, "ClipDataReader", op_class)


class ClipMediaPipe(MediaPipe):
"""
Class defining clip media pipe:
Expand Down Expand Up @@ -160,8 +147,13 @@ def __init__(self, dataset=None, sampler=None, batch_size=512, drop_last=False,
super(ClipMediaPipe, self).__init__(
device=self.device, batch_size=batch_size, prefetch_depth=queue_depth, pipe_name=pipe_name
)

self.input = fn.ClipDataReader(label_dtype=dtype.UINT32, dataset=self.dataset)
params = read_image_text_from_dataset_params.copy()
params["dataset"] = self.dataset
self.input = fn.MediaExtReaderOp(
impl=read_image_text_from_dataset,
num_outputs=3,
priv_params=params,
)
def_output_image_size = [self.image_size, self.image_size]
res_pp_filter = ftype.BICUBIC
self.decode = fn.ImageDecoder(
Expand Down
7 changes: 6 additions & 1 deletion examples/language-modeling/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,7 @@ python3 run_lora_clm.py \
--max_grad_norm 0.3 \
--logging_steps 1 \
--do_train \
--do_eval \
--use_habana \
--use_lazy_mode \
--throughput_warmup_steps 3 \
Expand All @@ -380,6 +381,7 @@ python3 run_lora_clm.py \
--dataset_concatenation \
--max_seq_length 512 \
--low_cpu_mem_usage True \
--validation_split_percentage 4 \
--adam_epsilon 1e-08
```

Expand Down Expand Up @@ -436,6 +438,7 @@ python ../gaudi_spawn.py \
--max_grad_norm 0.3 \
--logging_steps 1 \
--do_train \
--do_eval \
--use_habana \
--use_lazy_mode \
--throughput_warmup_steps 3 \
Expand All @@ -447,6 +450,7 @@ python ../gaudi_spawn.py \
--max_seq_length 512 \
--ddp_bucket_cap_mb 50 \
--adam_epsilon 1e-08 \
--validation_split_percentage 4 \
--low_cpu_mem_usage True
```

Expand Down Expand Up @@ -550,7 +554,8 @@ python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lora_clm.py \
--lora_rank 4 \
--lora_target_modules "q_proj" "v_proj" "k_proj" "o_proj" \
--validation_split_percentage 4 \
--use_flash_attention True
--use_flash_attention True \
--flash_attention_causal_mask True
```

- Multi-card finetuning of Falcon-180B:
Expand Down
6 changes: 5 additions & 1 deletion examples/language-modeling/run_clm.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,9 @@ class DataTrainingArguments:
keep_linebreaks: bool = field(
default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
)
save_last_ckpt: bool = field(
default=True, metadata={"help": "Whether to save checkpoint at the end of the training."}
)

def __post_init__(self):
if self.streaming:
Expand Down Expand Up @@ -643,7 +646,8 @@ def compute_metrics(eval_preds):
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload
if data_args.save_last_ckpt:
trainer.save_model() # Saves the tokenizer too for easy upload

metrics = train_result.metrics

Expand Down
20 changes: 20 additions & 0 deletions examples/language-modeling/run_lora_clm.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,23 @@ class ModelArguments:
)
},
)
flash_attention_causal_mask: bool = field(
default=False,
metadata={
"help": (
"Whether to enable causal mask in Habana flash attention for fine-tuning."
" It is applicable only when use_flash_attention is True.",
)
},
)
use_fused_rope: bool = field(
default=True,
metadata={
"help": (
"Whether to use Habana fused-rope for fine-tuning. The current support is limited to Llama only.",
)
},
)
load_meta_device: bool = field(
default=False,
metadata={
Expand Down Expand Up @@ -537,6 +554,9 @@ def main():
if model_args.use_flash_attention:
model.generation_config.use_flash_attention = True
model.generation_config.flash_attention_recompute = model_args.flash_attention_recompute
model.generation_config.flash_attention_causal_mask = model_args.flash_attention_causal_mask
if not model_args.use_fused_rope:
model.generation_config.use_fused_rope = False

if hasattr(model.generation_config, "pad_token_id") and model.generation_config.pad_token_id is not None:
tokenizer.pad_token_id = model.generation_config.pad_token_id
Expand Down
27 changes: 27 additions & 0 deletions examples/text-generation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,9 @@ python run_generation.py \
`--bucket_size` option is especially useful when processing an input stream with varying lengths, that is when you have something like `--dataset_name squad --column_name context --max_input_tokens -1`. `--max_input_tokens -1` specifies no truncation of input prompt in the dataset.

Another way to simulate dynamic input is to use `--simulate_dyn_prompt`. For example `--simulate_dyn_prompt 25 35 45` will extend or crop the default prompt (or the prompt passed in using `--prompt`) to sizes 25, 35, and 45, and throughput will be measured for these 3 lengths. If `--simulate_dyn_prompt` is used, the min and max input lengths from it are computed to perform warmup as well. One final optimization that can be used in case of dynamic inputs is `--reduce_recompile`. Thus the suggested configuration to simulate dynamicity after warmup is to use all three arguments: `--simulate_dyn_prompt 25 35 45 --reduce_recompile --bucket_size 30`

While `--bucket_size` works for any model without model file changes, an even more optimized version of bucketing is supported for certain models like Llama. This can be enabled by setting the `--bucket_internal` flag (along with `--bucket_size` to specify the bucket size).

### Running with FP8

Llama2-70b and Llama2-7b in FP8 are enabled using the Quantization Toolkit (HQT), which provides model measurement and quantization capabilities in PyTorch.
Expand Down Expand Up @@ -293,6 +296,30 @@ QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \
```
`--fp8` is required to enable quantization in fp8.

### Using Habana Flash Attention

Habana Flash Attention addresses large sequence lengths in the prompt stage of inference. Using a causal attention mask in the prompt stage requires all input sequences in a batch to be of the same length, but can provide a memory saving, thus enabling higher batch sizes.

The example below uses `flash_attention_recompute` mode in order to reduce memory consumption in the prompt stage. Additionally, since all sequences in a batch are of the same length, it uses `flash_attention_causal_mask`, which will further improve performance by taking advantage of the specific lower-diagonal shape of the inputs to the softmax operation.

```bash
python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \
--model_name_or_path meta-llama/Llama-2-70b-hf \
--use_hpu_graphs \
--use_kv_cache \
--reuse_cache \
--trim_logits \
--attn_softmax_bf16 \
--max_input_tokens 31744 \
--max_new_tokens 1024 \
--batch_size=12 \
--use_flash_attention \
--flash_attention_recompute \
--flash_attention_causal_mask \
--book_source
```

For more details see [documentation](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_PyTorch_Models.html#using-fused-sdpa).

## Language Model Evaluation Harness

Expand Down
68 changes: 66 additions & 2 deletions examples/text-generation/run_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,11 @@ def setup_parser(parser):
then we use `shape = prompt_length + max_new_tokens`. If a positive number is passed \
we increase the bucket in steps of `bucket_size` instead of allocating to max (`prompt_length + max_new_tokens`).",
)
parser.add_argument(
"--bucket_internal",
action="store_true",
help="Split kv sequence into buckets in decode phase. It improves throughput when max_new_tokens is large.",
)
parser.add_argument(
"--dataset_max_samples",
default=-1,
Expand Down Expand Up @@ -227,6 +232,21 @@ def setup_parser(parser):
action="store_true",
help="Whether to enable Habana Flash Attention, provided that the model supports it.",
)
parser.add_argument(
"--flash_attention_recompute",
action="store_true",
help="Whether to enable Habana Flash Attention in recompute mode on first token generation. This gives an opportunity of splitting graph internally which helps reduce memory consumption.",
)
parser.add_argument(
"--flash_attention_causal_mask",
action="store_true",
help="Whether to enable Habana Flash Attention in causal mode on first token generation.",
)
parser.add_argument(
"--book_source",
action="store_true",
help="Whether to use project Guttenberg books data as input. Usefull for testing large sequence lenghts.",
)
parser.add_argument(
"--torch_compile",
action="store_true",
Expand Down Expand Up @@ -266,6 +286,45 @@ def main():
# Benchmark over the prompts below
if args.prompt:
input_sentences = args.prompt
elif args.book_source:

def download_book(book_id):
    """
    Download a Project Gutenberg book as plain text and store it under /tmp.

    Args:
        book_id: Numeric Project Gutenberg identifier of the book to fetch.

    Returns:
        Path of the saved text file. The file name embeds the current process
        id (presumably so that concurrently launched processes do not
        overwrite each other's copy).

    Exits the process with a non-zero status if the download fails.
    """
    import os
    import sys

    import requests

    url = f"https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt"
    try:
        # Without a timeout an unresponsive server would block the run forever.
        response = requests.get(url, timeout=60)
    except requests.RequestException as error:
        print(f"Failed to download book! Exiting... ({error})")
        sys.exit(1)

    if response.status_code != 200:
        print("Failed to download book! Exiting...")
        # A bare sys.exit() would report success (status 0) to the launcher.
        sys.exit(1)

    save_path = f"/tmp/{book_id}_{os.getpid()}.txt"
    with open(save_path, "wb") as file:
        file.write(response.content)
    print(f"Book downloaded and saved to: {save_path}")
    return save_path

def assemble_prompt(prompt_size, book_path):
    """
    Build a batch of identical prompts from the first words of a book.

    Args:
        prompt_size: Number of whitespace-separated words to take from the
            book. A non-positive value, or a value larger than the number of
            words available, selects the whole book instead of yielding no
            prompt at all.
        book_path: Path to the plain-text book file.

    Returns:
        A list holding `args.batch_size` copies of the prompt. Every word,
        including the last one, is followed by a single space.
    """
    from itertools import chain, islice

    # "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order mark,
    # which would otherwise end up glued to the first word of the prompt.
    with open(book_path, encoding="utf-8-sig") as book_file:
        words = chain.from_iterable(line.split() for line in book_file)
        if prompt_size > 0:
            words = islice(words, prompt_size)
        # Join once instead of growing the string word by word.
        prompt = "".join(f"{word} " for word in words)
    return [prompt] * args.batch_size

book_ids = [
2701, # Moby Dick; Or, The Whale
1513, # Romeo and Juliet
1342, # Pride and Prejudice
]
input_sentences = assemble_prompt(prompt_size=args.max_input_tokens, book_path=download_book(book_ids[0]))
else:
input_sentences = [
"DeepSpeed is a machine learning framework",
Expand All @@ -289,6 +348,8 @@ def main():
def generate(size=None, reduce_recompile=False):
"""Generates sequences from the input sentences and returns them."""

t0 = time.perf_counter()
print(f"Step4+ starting time is {t0*1000}", flush=True)
# Tokenization
if args.max_input_tokens > 0:
input_tokens = tokenizer.batch_encode_plus(
Expand All @@ -309,15 +370,18 @@ def generate(size=None, reduce_recompile=False):
if torch.is_tensor(input_tokens[t]):
input_tokens[t] = input_tokens[t].to(args.device)

outputs = model.generate(
output_tokens = model.generate(
**input_tokens,
generation_config=generation_config,
lazy_mode=use_lazy_mode,
hpu_graphs=args.use_hpu_graphs,
profiling_steps=args.profiling_steps,
profiling_warmup_steps=args.profiling_warmup_steps,
).cpu()
return tokenizer.batch_decode(outputs, skip_special_tokens=True)
outputs = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
duration = time.perf_counter() - t0
print(f"Total E2E time of this iteration is {duration:.3f}s", flush=True)
return outputs

from optimum.habana.utils import HabanaProfile

Expand Down
3 changes: 3 additions & 0 deletions examples/text-generation/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,7 @@ def setup_generation_config(args, model, tokenizer):
generation_config.use_cache = args.use_kv_cache
generation_config.static_shapes = is_optimized
generation_config.bucket_size = args.bucket_size if is_optimized else -1
generation_config.bucket_internal = args.bucket_internal
generation_config.do_sample = args.do_sample
generation_config.num_beams = args.num_beams
generation_config.bad_words_ids = bad_words_ids
Expand All @@ -343,6 +344,8 @@ def setup_generation_config(args, model, tokenizer):
assert generation_config.bucket_size > 0
generation_config.kv_cache_fp8 = args.kv_cache_fp8
generation_config.use_flash_attention = args.use_flash_attention
generation_config.flash_attention_recompute = args.flash_attention_recompute
generation_config.flash_attention_causal_mask = args.flash_attention_causal_mask
return generation_config


Expand Down
4 changes: 3 additions & 1 deletion optimum/habana/accelerate/utils/dataclasses.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ class GaudiDynamoBackend(str, BaseEnum):
- **IPEX** -- Uses IPEX for inference on CPU. Inference only. [Read
more](https://github.com/intel/intel-extension-for-pytorch).
- **TVM** -- Uses Apache TVM for inference optimizations. [Read more](https://tvm.apache.org/)
- **AOT_HPU_TRAINING_BACKEND** -- Uses Habana Gaudi.
- **AOT_HPU_TRAINING_BACKEND** -- Uses Habana Gaudi - deprecated - will be removed.
- **HPU_BACKEND** -- Uses Habana Gaudi.

"""

Expand All @@ -92,6 +93,7 @@ class GaudiDynamoBackend(str, BaseEnum):
IPEX = "IPEX"
TVM = "TVM"
AOT_HPU_TRAINING_BACKEND = "AOT_HPU_TRAINING_BACKEND"
HPU_BACKEND = "HPU_BACKEND"


@dataclass
Expand Down
Loading