huggingface · regisss · Jan 4, 2024 · Nov 9, 2023 · Nov 14, 2023 · Nov 15, 2023
@@ -287,3 +287,7 @@ deepspeed --num_gpus 8 run_lm_eval.py \
 --tasks winogrande \
 -o eval.json
 ```
+
+## Text-Generation Pipeline
+
+A Transformers-like pipeline is defined and provided [here](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation/text-generation-pipeline). It is optimized for Gaudi and can be called to generate text in your scripts.
@@ -231,6 +231,8 @@ def setup_parser(parser):
         action="store_true",
         help="Whether to enable Habana Flash Attention, provided that the model supports it.",
     )
+    parser.add_argument("--temperature", default=1.0, type=float, help="Temperature value for text generation")
+    parser.add_argument("--top_p", default=1.0, type=float, help="Top_p value for generating text via sampling")
 
     args = parser.parse_args()
 

@@ -0,0 +1,127 @@
+<!---
+Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Text-Generation Pipeline
+
+The text-generation pipeline can be used to perform text generation by providing one or multiple prompts as input.
+
+## Requirements
+
+Update `PYTHONPATH` as follows.
+```bash
+export OPTIMUM_HABANA_PATH=/path/to/optimum-habana
+export PYTHONPATH=${PYTHONPATH}:${OPTIMUM_HABANA_PATH}/examples/text-generation
+```
+
+If you plan to use [DeepSpeed-inference](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/Inference_Using_DeepSpeed.html), you should install DeepSpeed as follows:
+```bash
+pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.13.0
+```
+
+## Usage
+
+To run generation with DeepSpeed-inference, you must launch the script as follows:
+
+```bash
+python ../../gaudi_spawn.py --use_deepspeed --world_size number_of_devices run_pipeline.py ARGS
+```
+
+Without DeepSpeed-inference, you can run the script with:
+
+```bash
+python run_pipeline.py ARGS
+```
+
+The list of all possible arguments can be obtained by running:
+```bash
+python run_pipeline.py --help
+```
+
+
+### Single and multiple prompts
+
+If you want to generate a sequence of text from a prompt of your choice, you should use the `--prompt` argument.
+For example:
+```
+python run_pipeline.py \
+--model_name_or_path meta-llama/Llama-2-7b-hf \
+--use_hpu_graphs \
+--use_kv_cache \
+--max_new_tokens 100 \
+--do_sample \
+--prompt "Here is my prompt"
+```
+
+If you want to provide several prompts as inputs, here is how to do it:
+```
+python run_pipeline.py \
+--model_name_or_path meta-llama/Llama-2-7b-hf \
+--use_hpu_graphs \
+--use_kv_cache \
+--max_new_tokens 100 \
+--do_sample \
+--prompt "Hello world" "How are you?"
+```
+
+If you want to perform generation on default prompts, do not pass the `--prompt` argument.
+```
+python run_pipeline.py \
+--model_name_or_path meta-llama/Llama-2-7b-hf \
+--use_hpu_graphs \
+--use_kv_cache \
+--max_new_tokens 100 \
+--do_sample
+```
+
+If you want to change the temperature and top_p values, make sure to include the `--do_sample` argument. Here is a sample command.
+```
+python run_pipeline.py \
+--model_name_or_path meta-llama/Llama-2-7b-hf \
+--use_hpu_graphs \
+--use_kv_cache \
+--max_new_tokens 100 \
+--do_sample \
+--temperature 0.5 \
+--top_p 0.95 \
+--prompt "Hello world" "How are you?"
+```
+
+### Multi-card runs
+
+To run a large model such as Llama-2-70b via DeepSpeed, run the following command.
+```
+python ../../gaudi_spawn.py --use_deepspeed --world_size 8 run_pipeline.py \
+--model_name_or_path meta-llama/Llama-2-70b-hf \
+--max_new_tokens 100 \
+--bf16 \
+--use_hpu_graphs \
+--use_kv_cache \
+--prompt "Hello world" "How are you?" "Here is my prompt" "Once upon a time"
+```
+
+To change the temperature and top_p values, run the following command.
+```
+python ../../gaudi_spawn.py --use_deepspeed --world_size 8 run_pipeline.py \
+--model_name_or_path meta-llama/Llama-2-70b-hf \
+--max_new_tokens 100 \
+--bf16 \
+--use_hpu_graphs \
+--use_kv_cache \
+--do_sample \
+--temperature 0.5 \
+--top_p 0.95 \
+--prompt "Hello world" "How are you?" "Here is my prompt" "Once upon a time"
+```
@@ -0,0 +1,47 @@
+import torch
+from transformers import TextGenerationPipeline
+from utils import initialize_model
+
+
+class GaudiTextGenerationPipeline(TextGenerationPipeline):
+    def __init__(self, args, logger):
+        self.model, self.tokenizer, self.generation_config = initialize_model(args, logger)
+
+        self.device = args.device
+
+        if args.do_sample:
+            self.generation_config.temperature = args.temperature
+            self.generation_config.top_p = args.top_p
+
+        self.max_padding_length = args.max_input_tokens if args.max_input_tokens > 0 else 100
+        self.use_hpu_graphs = args.use_hpu_graphs
+        self.profiling_steps = args.profiling_steps
+        self.profiling_warmup_steps = args.profiling_warmup_steps
+
+        import habana_frameworks.torch.hpu as torch_hpu
+
+        logger.info("Graph compilation...")
+        for _ in range(3):
+            self("Here is my prompt")
+        torch_hpu.synchronize()
+
+    def __call__(self, prompt: str):
+        model_inputs = self.tokenizer.encode_plus(
+            prompt, return_tensors="pt", max_length=self.max_padding_length, padding="max_length", truncation=True
+        )
+
+        for t in model_inputs:
+            if torch.is_tensor(model_inputs[t]):
+                model_inputs[t] = model_inputs[t].to(self.device)
+
+        output = self.model.generate(
+            **model_inputs,
+            generation_config=self.generation_config,
+            lazy_mode=True,
+            hpu_graphs=self.use_hpu_graphs,
+            profiling_steps=self.profiling_steps,
+            profiling_warmup_steps=self.profiling_warmup_steps,
+        ).cpu()
+
+        output_text = self.tokenizer.decode(output[0], skip_special_tokens=True)
+        return output_text
@@ -0,0 +1,52 @@
+import argparse
+import logging
+import time
+
+from pipeline import GaudiTextGenerationPipeline
+from run_generation import setup_parser
+
+
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+    datefmt="%m/%d/%Y %H:%M:%S",
+    level=logging.INFO,
+)
+logger = logging.getLogger(__name__)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    args = setup_parser(parser)
+    args.num_return_sequences = 1
+
+    if args.prompt:
+        input_sentences = args.prompt
+    else:
+        input_sentences = [
+            "DeepSpeed is a machine learning framework",
+            "He is working on",
+            "He has a",
+            "He got all",
+            "Everyone is happy and I can",
+            "The new movie that got Oscar this year",
+            "In the far far distance from our galaxy,",
+            "Peace is the only way",
+        ]
+
+    logger.info("Initializing text-generation pipeline...")
+    pipe = GaudiTextGenerationPipeline(args, logger)
+
+    logger.info("Running inference...")
+    for input_sentence in input_sentences:
+        print(f"Prompt: {input_sentence}")
+        t0 = time.perf_counter()
+        output = pipe(input_sentence)
+        duration = time.perf_counter() - t0
+        throughput = args.max_new_tokens / duration
+        print(f"Generated Text: {repr(output)}")
+        print(f"Inference Duration: {duration} seconds")
+        print(f"Throughput: {throughput} tokens/second")
+
+
+# Run the example only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()