-
Notifications
You must be signed in to change notification settings - Fork 270
Tests for text gen output text #1411
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
07df48a
610dfe0
6a506b2
28eaf8f
a7a1a37
25c2799
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,75 @@ | ||
| import json | ||
| import os | ||
| import re | ||
| import subprocess | ||
| from pathlib import Path | ||
| from tempfile import TemporaryDirectory | ||
|
|
||
| import pytest | ||
|
|
||
| from optimum.habana.utils import set_seed | ||
|
|
||
|
|
||
# Expected generated text for each model, keyed by model name. The test
# parametrization iterates over these keys and the output check indexes this
# mapping by model name, so it must be a dict in every branch.
if os.environ.get("GAUDI2_CI", "0") == "1":
    MODEL_OUTPUTS = {
        "bigcode/starcoder": 'def print_hello_world():\n print("Hello World")\n\ndef print_hello_world_twice():\n print_hello_world()\n print_hello_world()\n\ndef print_hello_world_thrice():\n print_hello_world()\n print_hello_world()\n print_hello_world()\n\ndef print_hello_world_four_times():\n print_hello_world()\n print_hello_world()\n print_hello_world()\n ',
        "bigcode/starcoder2-3b": 'def print_hello_world():\n print("Hello World")\n\ndef print_hello_world_with_name(name):\n print("Hello World, " + name)\n\ndef print_hello_world_with_name_and_age(name, age):\n print("Hello World, " + name + ", " + str(age))\n\ndef print_hello_world_with_name_and_age_and_gender(name, age, gender):\n print("Hello',
        "google/gemma-7b": "DeepSpeed is a machine learning framework that enables training of large-scale models on commodity hardware. It is designed to be a drop-in replacement for PyTorch, and it is compatible with the existing PyTorch ecosystem. DeepSpeed is designed to be easy to use, and it provides a number of features that make it easy to train large-scale models.\n\nDeepSpeed is a machine learning framework that enables training of large-scale models on commodity hardware. It is designed to be a drop-in replacement for PyTorch, and",
        "meta-llama/Llama-2-7b-hf": "DeepSpeed is a machine learning framework for deep learning. It is designed to be fast and efficient, while also being easy to use. DeepSpeed is based on the TensorFlow framework, and it uses the TensorFlow library to perform computations.\nDeepSpeed is a deep learning framework that is designed to be fast and efficient. It is based on the TensorFlow library and uses the TensorFlow library to perform computations. DeepSpeed is designed to be easy to use and to provide a high level of flex",
        "mistralai/Mistral-7B-v0.1": "DeepSpeed is a machine learning framework that accelerates training of large models on a single machine or distributed systems. It is designed to be compatible with PyTorch and TensorFlow, and can be used to train models on a single machine or on a distributed system.\n\nDeepSpeed is a machine learning framework that accelerates training of large models on a single machine or distributed systems. It is designed to be compatible with PyTorch and TensorFlow, and can be used to train models on a single machine or on a distributed system",
        "mistralai/Mixtral-8x7B-v0.1": "DeepSpeed is a machine learning framework that enables training of large models on a single machine with a single GPU. It is designed to be easy to use and efficient, and it can be used to train models on a variety of tasks.\n\n## Introduction\n\nDeepSpeed is a machine learning framework that enables training of large models on a single machine with a single GPU. It is designed to be easy to use and efficient, and it can be used to train models on a variety of tasks.\n\n## What is DeepSpeed",
        "Qwen/Qwen2-7B": "DeepSpeed is a machine learning framework that provides a unified interface for training deep learning models. It is designed to be easy to use and to provide high performance on a variety of hardware platforms. DeepSpeed is built on top of PyTorch and TensorFlow, and it supports a wide range of models architectures, including transformer models, convolutional neural networks, and recurrent neural networks.\nDeepSpeed is designed to be easy to use, and it provides a unified interface for training deep learning models. It supports a wide range of model architectures, including",
    }
else:
    # Functional testing only on G2 onwards.
    # An empty dict (not a list) keeps `.keys()` and key lookups valid, so the
    # module still imports and simply collects zero parametrized cases.
    MODEL_OUTPUTS = {}
|
|
||
|
|
||
def _test_text_generation(
    model_name: str,
    token: str,
):
    """Run the text-generation example for ``model_name`` and check its output.

    The example script is launched in a subprocess with KV cache, HPU graphs
    and bf16 enabled. Its ``results.json`` is then read from a temporary
    output directory and the first generated sequence is compared with the
    expected text stored in ``MODEL_OUTPUTS``.

    Args:
        model_name: Model identifier passed as ``--model_name_or_path``; also
            the key used to look up the expected output.
        token: Object whose ``.value`` attribute is passed as ``--token``.
            NOTE(review): annotated as ``str`` but accessed via ``.value`` —
            presumably a secret-wrapper fixture; confirm against conftest.

    Raises:
        AssertionError: If the subprocess exits with a non-zero code or the
            generated text differs from the expected one.
    """
    # NOTE(review): this seeds the current process only; generation happens in
    # the subprocess below, so confirm whether this call has any effect there.
    set_seed(42)
    command = ["python3"]
    path_to_example_dir = Path(__file__).resolve().parent.parent / "examples"
    env_variables = os.environ.copy()

    command += [
        f"{path_to_example_dir}/text-generation/run_generation.py ",
        f"--model_name_or_path {model_name}",
        "--use_kv_cache",
        "--use_hpu_graphs",
        "--bf16",
    ]

    with TemporaryDirectory() as tmp_dir:
        command.append(f"--output_dir {tmp_dir}")
        command.append(f"--token {token.value}")

        pattern = re.compile(r"([\"\"].+?[\"\"])|\s")

        # Split every "flag value" entry into separate argv items; the
        # capturing group yields None/empty pieces, which are filtered out.
        command = [x for y in command for x in re.split(pattern, y) if x]
        if "starcoder" in model_name:
            # Appended after the split so the prompt stays a single argument.
            command.append("--prompt")
            command.append("def print_hello_world():")

        # Never log the real token: mask it by position rather than assuming
        # it is the last two items (the prompt may follow it).
        printable_command = " ".join(_mask_token(command))
        print(f"\n\nCommand to test: {printable_command}\n")
        proc = subprocess.run(command, env=env_variables)

        # Ensure the run finished without any issue
        # Use try-except to avoid logging the token if used
        try:
            assert proc.returncode == 0
        except AssertionError as e:
            # e.args can be empty when assertion rewriting is not active.
            if e.args and "'--token', 'hf_" in str(e.args[0]):
                e.args = (f"The following command failed:\n{printable_command}",)
            raise

        with open(Path(tmp_dir) / "results.json") as fp:
            results = json.load(fp)

        assert results["output"][0][0] == MODEL_OUTPUTS[model_name]


def _mask_token(command):
    """Return a copy of ``command`` with the argument after ``--token`` masked."""
    masked = list(command)
    for idx in range(len(masked) - 1):
        if masked[idx] == "--token":
            masked[idx + 1] = "<redacted>"
    return masked
|
|
||
|
|
||
# Iterate the container directly instead of calling `.keys()`: this yields the
# model names for a dict and still works if MODEL_OUTPUTS is an empty sequence.
@pytest.mark.parametrize("model_name", list(MODEL_OUTPUTS))
def test_text_generation_bf16_1x(model_name: str, token: str):
    """Check the generated text of ``model_name`` against its expected output.

    One test case is created per key of ``MODEL_OUTPUTS``; the actual work is
    delegated to ``_test_text_generation``.
    """
    _test_text_generation(model_name, token)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,6 +9,8 @@ | |
|
|
||
| import pytest | ||
|
|
||
| from optimum.habana.utils import set_seed | ||
|
|
||
| from .test_examples import TIME_PERF_FACTOR | ||
|
|
||
|
|
||
|
|
@@ -19,34 +21,40 @@ | |
| # Gaudi2 CI baselines | ||
| MODELS_TO_TEST = { | ||
| "bf16_1x": [ | ||
| ("bigscience/bloomz-7b1", 1, False, 130.0472971205316), | ||
| ("gpt2-xl", 1, False, 281.8734689674413), | ||
| ("EleutherAI/gpt-j-6b", 1, False, 160.5823842101192), | ||
| ("EleutherAI/gpt-neox-20b", 1, False, 50.67672679310354), | ||
| ("meta-llama/Llama-2-7b-hf", 1, True, 141.25776956002076), | ||
| ("tiiuae/falcon-40b", 1, True, 25.202450111088346), | ||
| ("bigcode/starcoder", 256, True, 6846.575763562658), | ||
| ("Salesforce/codegen2-1B", 1, False, 446.4029486883532), | ||
| ("mosaicml/mpt-30b", 1, False, 36.06464336116623), | ||
| ("mistralai/Mistral-7B-v0.1", 1, True, 130.2172236767782), | ||
| ("mistralai/Mixtral-8x7B-v0.1", 1, False, 23.7931001677926), | ||
| ("microsoft/phi-2", 1, False, 224.72307766211117), | ||
| ("meta-llama/Meta-Llama-3-8B", 1, True, 129), | ||
| ("meta-llama/Llama-2-7b-hf", 512, True, 12808), | ||
| ("meta-llama/Llama-2-7b-hf", 512, False, 8711), # in some cases like TGI, reuse_cache isnt used | ||
| ("stabilityai/stablelm-2-12b", 1, False, 74.8904496532218), | ||
| ("codellama/CodeLlama-34b-hf", 1, True, 32.644), | ||
| ("bigcode/starcoder2-3b", 1, False, 261.07213776344133), | ||
| ("adept/persimmon-8b-base", 4, False, 366.73968820698406), | ||
| ("Qwen/Qwen1.5-7B", 4, False, 490.8621617893209), | ||
| ("google/gemma-7b", 1, False, 109.70751574382221), | ||
| ("state-spaces/mamba-130m-hf", 1536, False, 5385.511100161605), | ||
| ("Deci/DeciLM-7B", 1, False, 120), | ||
| ("Qwen/Qwen2-7B", 512, False, 9669.45787), | ||
| ("Qwen/Qwen1.5-MoE-A2.7B", 1, True, 44.25834541569395), | ||
| ("EleutherAI/gpt-neo-2.7B", 1, False, 257.2476416844122), | ||
| ("facebook/xglm-1.7B", 1, False, 357.46365062825083), | ||
| ("CohereForAI/c4ai-command-r-v01", 1, False, 29.50315234651154), | ||
| ("bigscience/bloomz-7b1", 1, False, 130.0472971205316, False), | ||
| ("gpt2-xl", 1, False, 281.8734689674413, False), | ||
| ("EleutherAI/gpt-j-6b", 1, False, 160.5823842101192, False), | ||
| ("EleutherAI/gpt-neox-20b", 1, False, 50.67672679310354, False), | ||
| ("meta-llama/Llama-2-7b-hf", 1, True, 141.25776956002076, True), | ||
| ("tiiuae/falcon-40b", 1, True, 25.202450111088346, False), | ||
| ( | ||
| "bigcode/starcoder", | ||
| 256, | ||
| True, | ||
| 6846.575763562658, | ||
| False, | ||
| ), # TODO: Enable check_output after model bigcode/starcoder is fixed | ||
| ("Salesforce/codegen2-1B", 1, False, 446.4029486883532, False), | ||
| ("mosaicml/mpt-30b", 1, False, 36.06464336116623, False), | ||
| ("mistralai/Mistral-7B-v0.1", 1, True, 130.2172236767782, True), | ||
| ("mistralai/Mixtral-8x7B-v0.1", 1, False, 23.7931001677926, True), | ||
| ("microsoft/phi-2", 1, False, 224.72307766211117, False), | ||
| ("meta-llama/Meta-Llama-3-8B", 1, True, 129, False), | ||
| ("meta-llama/Llama-2-7b-hf", 512, True, 12808, False), | ||
| ("meta-llama/Llama-2-7b-hf", 512, False, 8711, False), # in some cases like TGI, reuse_cache isnt used | ||
| ("stabilityai/stablelm-2-12b", 1, False, 74.8904496532218, False), | ||
| ("codellama/CodeLlama-34b-hf", 1, True, 32.644, False), | ||
| ("bigcode/starcoder2-3b", 1, False, 261.07213776344133, True), | ||
| ("adept/persimmon-8b-base", 4, False, 366.73968820698406, False), | ||
| ("Qwen/Qwen1.5-7B", 4, False, 490.8621617893209, False), | ||
| ("google/gemma-7b", 1, False, 109.70751574382221, True), | ||
| ("state-spaces/mamba-130m-hf", 1536, False, 5385.511100161605, False), | ||
| ("Deci/DeciLM-7B", 1, False, 120, False), | ||
| ("Qwen/Qwen2-7B", 512, False, 9669.45787, True), | ||
| ("Qwen/Qwen1.5-MoE-A2.7B", 1, True, 44.25834541569395, False), | ||
| ("EleutherAI/gpt-neo-2.7B", 1, False, 257.2476416844122, False), | ||
| ("facebook/xglm-1.7B", 1, False, 357.46365062825083, False), | ||
| ("CohereForAI/c4ai-command-r-v01", 1, False, 29.50315234651154, False), | ||
| ], | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So only the models marked "True" will have their output tested? If we want to add more output checks later on, all we need to do is change this last value to True and add the corresponding 1st token output to the "MODEL_OUTPUTS" table below?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, that is right. |
||
| "fp8": [ | ||
| ("tiiuae/falcon-180B", 4, 950, True, 128, 128, 2506.68), | ||
|
|
@@ -91,41 +99,51 @@ | |
| ("gpt2-xl", 1, False, 51.61471298016438), | ||
| ], | ||
| } | ||
| MODEL_OUTPUTS = { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can this also be used for the fp8 test?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I have not tried, but perhaps it can be a later effort.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah I think we should add it for fp8 output check as well - with fp8, and int4. Maybe for another PR. |
||
| "bigcode/starcoder": 'def print_hello_world():\n print("Hello World")\n\ndef print_hello_world_twice():\n print_hello_world()\n print_hello_world()\n\ndef print_hello_world_thrice():\n print_hello_world()\n print_hello_world()\n print_hello_world()\n\ndef print_hello_world_four_times():\n print_hello_world()\n print_hello_world()\n print_hello_world()\n ', | ||
| "bigcode/starcoder2-3b": 'def print_hello_world():\n print("Hello World")\n\ndef print_hello_world_with_name(name):\n print("Hello World, " + name)\n\ndef print_hello_world_with_name_and_age(name, age):\n print("Hello World, " + name + ", " + str(age))\n\ndef print_hello_world_with_name_and_age_and_gender(name, age, gender):\n print("Hello', | ||
| "google/gemma-7b": "DeepSpeed is a machine learning framework that enables training of large-scale models on commodity hardware. It is designed to be a drop-in replacement for PyTorch, and it is compatible with the existing PyTorch ecosystem. DeepSpeed is designed to be easy to use, and it provides a number of features that make it easy to train large-scale models.\n\nDeepSpeed is a machine learning framework that enables training of large-scale models on commodity hardware. It is designed to be a drop-in replacement for PyTorch, and", | ||
| "meta-llama/Llama-2-7b-hf": "DeepSpeed is a machine learning framework for deep learning. It is designed to be fast and efficient, while also being easy to use. DeepSpeed is based on the TensorFlow framework, and it uses the TensorFlow library to perform computations.\nDeepSpeed is a deep learning framework that is designed to be fast and efficient. It is based on the TensorFlow library and uses the TensorFlow library to perform computations. DeepSpeed is designed to be easy to use and to provide a high level of flex", | ||
| "mistralai/Mistral-7B-v0.1": "DeepSpeed is a machine learning framework that accelerates training of large models on a single machine or distributed systems. It is designed to be compatible with PyTorch and TensorFlow, and can be used to train models on a single machine or on a distributed system.\n\nDeepSpeed is a machine learning framework that accelerates training of large models on a single machine or distributed systems. It is designed to be compatible with PyTorch and TensorFlow, and can be used to train models on a single machine or on a distributed system", | ||
| "mistralai/Mixtral-8x7B-v0.1": "DeepSpeed is a machine learning framework that enables training of large models on a single machine with a single GPU. It is designed to be easy to use and efficient, and it can be used to train models on a variety of tasks.\n\n## Introduction\n\nDeepSpeed is a machine learning framework that enables training of large models on a single machine with a single GPU. It is designed to be easy to use and efficient, and it can be used to train models on a variety of tasks.\n\n## What is DeepSpeed", | ||
| "Qwen/Qwen2-7B": "DeepSpeed is a machine learning framework that provides a suite of toolskits for building and training deep learning models. It is designed to be highly scalable and efficient, and it supports a wide range of deep learning frameworks, including PyTorch, TensorFlow, and MXNet. DeepSpeed is particularly well-suited for training large-scale models on distributed systems, and it provides a number of features that make it easy to use and configure. Some of the key features of DeepSpeed include:\n\n- Distributed training: DeepSpeed supports distributed training on multiple", | ||
| } | ||
| else: | ||
| # Gaudi1 CI baselines | ||
| MODELS_TO_TEST = { | ||
| "bf16_1x": [ | ||
| ("bigscience/bloomz-7b1", 1, False, 41.7555095197846), | ||
| ("gpt2-xl", 1, False, 142.11481820425706), | ||
| ("bigscience/bloomz-7b1", 1, False, 41.7555095197846, False), | ||
| ("gpt2-xl", 1, False, 142.11481820425706, False), | ||
| # TODO: fix OPT 6.7B | ||
| # ("facebook/opt-6.7b", 0.0), | ||
| ("EleutherAI/gpt-j-6b", 1, True, 156.2893125740893), | ||
| ("meta-llama/Llama-2-7b-hf", 1, True, 44.39616259946937), | ||
| ("tiiuae/falcon-7b", 1, True, 44.82870145718665), | ||
| ("bigcode/starcoder", 1, False, 15.945023767901013), | ||
| ("Salesforce/codegen2-1B", 1, False, 155.32071248826423), | ||
| ("mosaicml/mpt-7b", 1, False, 45.45168927038262), | ||
| ("mistralai/Mistral-7B-v0.1", 1, True, 41.21906841459711), | ||
| ("microsoft/phi-2", 1, False, 92.53083167241344), | ||
| ("google/gemma-7b", 1, False, 28.84284625836978), | ||
| ("stabilityai/stablelm-2-12b", 1, False, 26.80858949645992), | ||
| ("Qwen/Qwen1.5-7B", 1, False, 39.29068423087616), | ||
| ("adept/persimmon-8b-base", 1, False, 34.53559807384106), | ||
| ("bigcode/starcoder2-3b", 1, False, 82.09655684566117), | ||
| ("state-spaces/mamba-130m-hf", 224, False, 794.542), | ||
| ("EleutherAI/gpt-j-6b", 1, True, 156.2893125740893, False), | ||
| ("meta-llama/Llama-2-7b-hf", 1, True, 44.39616259946937, False), | ||
| ("tiiuae/falcon-7b", 1, True, 44.82870145718665, False), | ||
| ("bigcode/starcoder", 1, False, 15.945023767901013, False), | ||
| ("Salesforce/codegen2-1B", 1, False, 155.32071248826423, False), | ||
| ("mosaicml/mpt-7b", 1, False, 45.45168927038262, False), | ||
| ("mistralai/Mistral-7B-v0.1", 1, True, 41.21906841459711, False), | ||
| ("microsoft/phi-2", 1, False, 92.53083167241344, False), | ||
| ("google/gemma-7b", 1, False, 28.84284625836978, False), | ||
| ("stabilityai/stablelm-2-12b", 1, False, 26.80858949645992, False), | ||
| ("Qwen/Qwen1.5-7B", 1, False, 39.29068423087616, False), | ||
| ("adept/persimmon-8b-base", 1, False, 34.53559807384106, False), | ||
| ("bigcode/starcoder2-3b", 1, False, 82.09655684566117, False), | ||
| ("state-spaces/mamba-130m-hf", 224, False, 794.542, False), | ||
| ], | ||
| "fp8": [], | ||
| "load_quantized_model_with_autogptq": [], | ||
| "deepspeed": [ | ||
| ("bigscience/bloomz-7b1", 8, 1, 31.994268212011505), | ||
| ("bigscience/bloomz-7b1", 8, 1, 31.994268212011505, False), | ||
| ], | ||
| "torch_compile": [], | ||
| "torch_compile_distributed": [], | ||
| "distributed_tp": [], | ||
| "contrastive_search": [ | ||
| ("gpt2-xl", 1, False, 34.48141280163397), | ||
| ("gpt2-xl", 1, False, 34.48141280163397, False), | ||
| ], | ||
| } | ||
| MODEL_OUTPUTS = {} | ||
|
|
||
|
|
||
| def _test_text_generation( | ||
|
|
@@ -143,6 +161,7 @@ def _test_text_generation( | |
| max_output_tokens: int = 100, | ||
| parallel_strategy: str = None, | ||
| contrastive_search: bool = False, | ||
| check_output: bool = False, | ||
| ): | ||
| command = ["python3"] | ||
| path_to_example_dir = Path(__file__).resolve().parent.parent / "examples" | ||
|
|
@@ -293,7 +312,13 @@ def _test_text_generation( | |
| ) | ||
|
|
||
| command = [x for y in command for x in re.split(pattern, y) if x] | ||
| print(f"\n\nCommand to test: {' '.join(command[:-2])}\n") | ||
| if "starcoder" in model_name and check_output: | ||
| command.append("--prompt") | ||
| command.append("def print_hello_world():") | ||
|
|
||
| set_seed(42) | ||
|
|
||
| print(f"\n\nCommand to test: {' '.join(command)}\n") | ||
| proc = subprocess.run(command, env=env_variables) | ||
|
|
||
| # Ensure the run finished without any issue | ||
|
|
@@ -311,10 +336,24 @@ def _test_text_generation( | |
| # Ensure performance requirements (throughput) are met | ||
| assert results["throughput"] >= (2 - TIME_PERF_FACTOR) * baseline | ||
|
|
||
| # Verify output for 1 HPU, BF16 | ||
| if check_output and model_name in MODEL_OUTPUTS: | ||
| expected_output = MODEL_OUTPUTS[model_name] | ||
| assert results["output"][0][0] == expected_output | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("model_name, batch_size, reuse_cache, baseline", MODELS_TO_TEST["bf16_1x"]) | ||
| def test_text_generation_bf16_1x(model_name: str, baseline: float, batch_size: int, reuse_cache: bool, token: str): | ||
| _test_text_generation(model_name, baseline, token, batch_size, reuse_cache) | ||
| @pytest.mark.parametrize("model_name, batch_size, reuse_cache, baseline, check_output", MODELS_TO_TEST["bf16_1x"]) | ||
| def test_text_generation_bf16_1x( | ||
| model_name: str, baseline: float, batch_size: int, reuse_cache: bool, token: str, check_output: bool | ||
| ): | ||
| _test_text_generation( | ||
| model_name=model_name, | ||
| baseline=baseline, | ||
| token=token, | ||
| batch_size=batch_size, | ||
| reuse_cache=reuse_cache, | ||
| check_output=check_output, | ||
| ) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@regisss, a common theme I observe in the outputs (except starcoder/starcoder2) is repeated sentences; I'm not sure whether that is another bug or expected behavior.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not surprised, greedy search tends to produce very repetitive outputs as it only looks for the most likely token to generate in each iteration. Tweaking some generation parameters (for example to penalize more the sequences of tokens that are already part of the output) and/or using sampling (to consider several candidates and not only the most likely one) usually helps to give the model more flexibility and to get more realistic outputs.