diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md index 63ee4d6a33..cafe16d897 100644 --- a/examples/text-generation/README.md +++ b/examples/text-generation/README.md @@ -287,3 +287,7 @@ deepspeed --num_gpus 8 run_lm_eval.py \ --tasks winogrande \ -o eval.json ``` + +## Text-Generation Pipeline + +A Transformers-like pipeline is defined and provided [here](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation/text-generation-pipeline). It is optimized for Gaudi and can be called to generate text in your scripts. diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py index 445794048f..ab308e7023 100644 --- a/examples/text-generation/run_generation.py +++ b/examples/text-generation/run_generation.py @@ -231,6 +231,8 @@ def setup_parser(parser): action="store_true", help="Whether to enable Habana Flash Attention, provided that the model supports it.", ) + parser.add_argument("--temperature", default=1.0, type=float, help="Temperature value for text generation") + parser.add_argument("--top_p", default=1.0, type=float, help="Top_p value for generating text via sampling") args = parser.parse_args() diff --git a/examples/text-generation/text-generation-pipeline/README.md b/examples/text-generation/text-generation-pipeline/README.md new file mode 100644 index 0000000000..2fc93a6ca2 --- /dev/null +++ b/examples/text-generation/text-generation-pipeline/README.md @@ -0,0 +1,127 @@ + + +# Text-Generation Pipeline + +The text-generation pipeline can be used to perform text-generation by providing single or multiple prompts as input. + +## Requirements + +Update `PYTHONPATH` as follows. 
+```bash +export OPTIMUM_HABANA_PATH=/path/to/optimum-habana +export PYTHONPATH=${PYTHONPATH}:${OPTIMUM_HABANA_PATH}/examples/text-generation +``` + +If you plan to use [DeepSpeed-inference](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/Inference_Using_DeepSpeed.html), you should install DeepSpeed as follows: +```bash +pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.13.0 +``` + +## Usage + +To run generation with DeepSpeed-inference, you must launch the script as follows: + +```bash +python ../../gaudi_spawn.py --use_deepspeed --world_size number_of_devices run_pipeline.py ARGS +``` + +Without DeepSpeed-inference, you can run the script with: + +```bash +python run_pipeline.py ARGS +``` + +The list of all possible arguments can be obtained by running: +```bash +python run_pipeline.py --help +``` + + +### Single and multiple prompts + +If you want to generate a sequence of text from a prompt of your choice, you should use the `--prompt` argument. +For example: +``` +python run_pipeline.py \ +--model_name_or_path meta-llama/Llama-2-7b-hf \ +--use_hpu_graphs \ +--use_kv_cache \ +--max_new_tokens 100 \ +--do_sample \ +--prompt "Here is my prompt" +``` + +If you want to provide several prompts as inputs, here is how to do it: +``` +python run_pipeline.py \ +--model_name_or_path meta-llama/Llama-2-7b-hf \ +--use_hpu_graphs \ +--use_kv_cache \ +--max_new_tokens 100 \ +--do_sample \ +--prompt "Hello world" "How are you?" +``` + +If you want to perform generation on default prompts, do not pass the `--prompt` argument. +``` +python run_pipeline.py \ +--model_name_or_path meta-llama/Llama-2-7b-hf \ +--use_hpu_graphs \ +--use_kv_cache \ +--max_new_tokens 100 \ +--do_sample +``` + +If you want to change the temperature and top_p values, make sure to include the `--do_sample` argument. Here is a sample command. 
+``` +python run_pipeline.py \ +--model_name_or_path meta-llama/Llama-2-7b-hf \ +--use_hpu_graphs \ +--use_kv_cache \ +--max_new_tokens 100 \ +--do_sample \ +--temperature 0.5 \ +--top_p 0.95 \ +--prompt "Hello world" "How are you?" +``` + +### Multi-card runs + +To run a large model such as Llama-2-70b via DeepSpeed, run the following command. +``` +python ../../gaudi_spawn.py --use_deepspeed --world_size 8 run_pipeline.py \ +--model_name_or_path meta-llama/Llama-2-70b-hf \ +--max_new_tokens 100 \ +--bf16 \ +--use_hpu_graphs \ +--use_kv_cache \ +--prompt "Hello world" "How are you?" "Here is my prompt" "Once upon a time" +``` + +To change the temperature and top_p values, run the following command. +``` +python ../../gaudi_spawn.py --use_deepspeed --world_size 8 run_pipeline.py \ +--model_name_or_path meta-llama/Llama-2-70b-hf \ +--max_new_tokens 100 \ +--bf16 \ +--use_hpu_graphs \ +--use_kv_cache \ +--do_sample \ +--temperature 0.5 \ +--top_p 0.95 \ +--prompt "Hello world" "How are you?" 
import torch
from transformers import TextGenerationPipeline
from utils import initialize_model


class GaudiTextGenerationPipeline(TextGenerationPipeline):
    """Single-prompt text-generation pipeline built from parsed CLI arguments.

    The model, tokenizer and generation config all come from
    ``initialize_model(args, logger)``. The parent constructor is not invoked
    here; every attribute this class uses is set directly in ``__init__``.
    """

    def __init__(self, args, logger):
        """Set up the pipeline and run three warm-up generations.

        Args:
            args: Parsed argument namespace. The attributes read here are
                ``device``, ``do_sample``, ``temperature``, ``top_p``,
                ``max_input_tokens``, ``use_hpu_graphs``, ``profiling_steps``
                and ``profiling_warmup_steps``.
            logger: Logger handed to ``initialize_model`` and used for the
                warm-up status message.
        """
        model, tokenizer, generation_config = initialize_model(args, logger)
        self.model = model
        self.tokenizer = tokenizer
        self.generation_config = generation_config

        self.device = args.device

        # Sampling knobs are only copied over when sampling was requested.
        if args.do_sample:
            self.generation_config.temperature = args.temperature
            self.generation_config.top_p = args.top_p

        # Every prompt is padded/truncated to this length; 100 is the fallback
        # when no positive ``max_input_tokens`` was given.
        if args.max_input_tokens > 0:
            self.max_padding_length = args.max_input_tokens
        else:
            self.max_padding_length = 100
        self.use_hpu_graphs = args.use_hpu_graphs
        self.profiling_steps = args.profiling_steps
        self.profiling_warmup_steps = args.profiling_warmup_steps

        # Imported at construction time rather than at module import time.
        import habana_frameworks.torch.hpu as torch_hpu

        logger.info("Graph compilation...")
        warmup_runs = 3
        for _ in range(warmup_runs):
            self("Here is my prompt")
        torch_hpu.synchronize()

    def __call__(self, prompt: str):
        """Generate text for one prompt and return the decoded first sequence.

        Args:
            prompt: The input text to continue.

        Returns:
            The decoded text of ``output[0]`` with special tokens skipped.
        """
        encoded = self.tokenizer.encode_plus(
            prompt,
            return_tensors="pt",
            max_length=self.max_padding_length,
            padding="max_length",
            truncation=True,
        )

        # Move tensor entries to the target device; other entries stay as-is.
        for key in encoded:
            value = encoded[key]
            if torch.is_tensor(value):
                encoded[key] = value.to(self.device)

        generated = self.model.generate(
            **encoded,
            generation_config=self.generation_config,
            lazy_mode=True,
            hpu_graphs=self.use_hpu_graphs,
            profiling_steps=self.profiling_steps,
            profiling_warmup_steps=self.profiling_warmup_steps,
        )
        generated = generated.cpu()

        return self.tokenizer.decode(generated[0], skip_special_tokens=True)
import argparse
import logging
import time

from pipeline import GaudiTextGenerationPipeline
from run_generation import setup_parser


logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


def main():
    """Parse CLI arguments, build the pipeline, and generate text per prompt.

    Prompts come from ``args.prompt`` when it is non-empty; otherwise a fixed
    list of default sentences is used. For each prompt the generated text,
    the wall-clock duration and a throughput figure are printed.
    """
    args = setup_parser(argparse.ArgumentParser())
    args.num_return_sequences = 1

    prompts = args.prompt or [
        "DeepSpeed is a machine learning framework",
        "He is working on",
        "He has a",
        "He got all",
        "Everyone is happy and I can",
        "The new movie that got Oscar this year",
        "In the far far distance from our galaxy,",
        "Peace is the only way",
    ]

    logger.info("Initializing text-generation pipeline...")
    pipe = GaudiTextGenerationPipeline(args, logger)

    logger.info("Running inference...")
    for prompt in prompts:
        print(f"Prompt: {prompt}")
        started = time.perf_counter()
        generated_text = pipe(prompt)
        elapsed = time.perf_counter() - started
        # Throughput is derived from the requested ``max_new_tokens``, not
        # from a count of the tokens actually produced.
        tokens_per_second = args.max_new_tokens / elapsed
        print(f"Generated Text: {generated_text!r}")
        print(f"Inference Duration: {elapsed} seconds")
        print(f"Throughput: {tokens_per_second} tokens/second")


if __name__ == "__main__":
    main()