Fix slow tests for important models to be compatible with A10 runners #29905
Changes from all commits
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ | |
|
|
||
| from transformers import AutoModelForCausalLM, AutoTokenizer, GemmaConfig, is_torch_available | ||
| from transformers.testing_utils import ( | ||
| is_flaky, | ||
| require_bitsandbytes, | ||
| require_flash_attn, | ||
| require_read_token, | ||
|
|
@@ -379,40 +380,6 @@ def test_save_load_fast_init_from_base(self): | |
| def test_past_key_values_format(self): | ||
| pass | ||
|
|
||
| @require_flash_attn | ||
| @require_torch_gpu | ||
| @pytest.mark.flash_attn_test | ||
| @slow | ||
| def test_flash_attn_2_generate_padding_right(self): | ||
| import torch | ||
|
|
||
| for model_class in self.all_generative_model_classes: | ||
| config, _ = self.model_tester.prepare_config_and_inputs_for_common() | ||
| model = model_class(config) | ||
|
|
||
| with tempfile.TemporaryDirectory() as tmpdirname: | ||
| model.save_pretrained(tmpdirname) | ||
| model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( | ||
| torch_device | ||
| ) | ||
|
|
||
| dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) | ||
| dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) | ||
|
|
||
| model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) | ||
|
|
||
| model = model_class.from_pretrained( | ||
| tmpdirname, | ||
| torch_dtype=torch.float16, | ||
| attn_implementation="flash_attention_2", | ||
| low_cpu_mem_usage=True, | ||
| ).to(torch_device) | ||
|
|
||
| with self.assertRaises(ValueError): | ||
| _ = model.generate( | ||
| dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False | ||
| ) | ||
|
|
||
| @require_flash_attn | ||
| @require_torch_gpu | ||
| @pytest.mark.flash_attn_test | ||
|
|
@@ -500,6 +467,7 @@ def test_sdpa_equivalence(self): | |
| @require_flash_attn | ||
| @require_torch_gpu | ||
| @pytest.mark.flash_attn_test | ||
| @is_flaky | ||
| @slow | ||
| def test_flash_attn_2_equivalence(self): | ||
| for model_class in self.all_model_classes: | ||
|
|
@@ -531,12 +499,21 @@ def test_flash_attn_2_equivalence(self): | |
| assert torch.allclose(logits_fa, logits, atol=3e-3) | ||
|
|
||
|
|
||
| @require_torch_gpu | ||
| @slow | ||
| @require_read_token | ||
| @require_torch_gpu | ||
| class GemmaIntegrationTest(unittest.TestCase): | ||
| input_text = ["Hello I am doing", "Hi today"] | ||
| # This variable is used to determine which CUDA device we are using for our runners (A10 or T4) | ||
| # Depending on the hardware, we get different logits / generations | ||
| cuda_compute_capability_major_version = None | ||
|
|
||
| @classmethod | ||
| def setUpClass(cls): | ||
| if is_torch_available() and torch.cuda.is_available(): | ||
| # 8 is for A100 / A10 and 7 for T4 | ||
| cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0] | ||
|
|
||
| @require_read_token | ||
|
Collaborator (Author): This should be a better name, cc @younesbelkada. See #29805 (comment). |
||
| def test_model_2b_fp32(self): | ||
| model_id = "google/gemma-2b" | ||
| EXPECTED_TEXTS = [ | ||
|
|
@@ -554,6 +531,7 @@ def test_model_2b_fp32(self): | |
|
|
||
| self.assertEqual(output_text, EXPECTED_TEXTS) | ||
|
|
||
| @require_read_token | ||
| def test_model_2b_fp16(self): | ||
| model_id = "google/gemma-2b" | ||
| EXPECTED_TEXTS = [ | ||
|
|
@@ -573,6 +551,7 @@ def test_model_2b_fp16(self): | |
|
|
||
| self.assertEqual(output_text, EXPECTED_TEXTS) | ||
|
|
||
| @require_read_token | ||
| def test_model_2b_fp16_static_cache(self): | ||
| model_id = "google/gemma-2b" | ||
| EXPECTED_TEXTS = [ | ||
|
|
@@ -594,12 +573,19 @@ def test_model_2b_fp16_static_cache(self): | |
|
|
||
| self.assertEqual(output_text, EXPECTED_TEXTS) | ||
|
|
||
| @require_read_token | ||
| def test_model_2b_bf16(self): | ||
| model_id = "google/gemma-2b" | ||
| EXPECTED_TEXTS = [ | ||
| "Hello I am doing a project on the 1990s and I need to know what the most popular music", | ||
| "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi", | ||
| ] | ||
| EXPECTED_TEXTS = { | ||
| 7: [ | ||
| "Hello I am doing a project on the 1990s and I need to know what the most popular music", | ||
| "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi", | ||
| ], | ||
| 8: [ | ||
| "Hello I am doing a project on the 1990s and I need to know what the most popular music", | ||
| "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat", | ||
| ], | ||
| } | ||
|
|
||
| model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to( | ||
| torch_device | ||
|
|
@@ -611,14 +597,21 @@ def test_model_2b_bf16(self): | |
| output = model.generate(**inputs, max_new_tokens=20, do_sample=False) | ||
| output_text = tokenizer.batch_decode(output, skip_special_tokens=True) | ||
|
|
||
| self.assertEqual(output_text, EXPECTED_TEXTS) | ||
| self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version]) | ||
|
|
||
| @require_read_token | ||
| def test_model_2b_eager(self): | ||
| model_id = "google/gemma-2b" | ||
| EXPECTED_TEXTS = [ | ||
| "Hello I am doing a project on the 1990s and I am looking for some information on the ", | ||
| "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat", | ||
| ] | ||
| EXPECTED_TEXTS = { | ||
| 7: [ | ||
| "Hello I am doing a project on the 1990s and I am looking for some information on the ", | ||
| "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat", | ||
| ], | ||
| 8: [ | ||
| "Hello I am doing a project on the 1990s and I need to know what the most popular music", | ||
| "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat", | ||
| ], | ||
| } | ||
|
|
||
| model = AutoModelForCausalLM.from_pretrained( | ||
| model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="eager" | ||
|
|
@@ -631,15 +624,22 @@ def test_model_2b_eager(self): | |
| output = model.generate(**inputs, max_new_tokens=20, do_sample=False) | ||
| output_text = tokenizer.batch_decode(output, skip_special_tokens=True) | ||
|
|
||
| self.assertEqual(output_text, EXPECTED_TEXTS) | ||
| self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version]) | ||
|
|
||
| @require_torch_sdpa | ||
| @require_read_token | ||
| def test_model_2b_sdpa(self): | ||
| model_id = "google/gemma-2b" | ||
| EXPECTED_TEXTS = [ | ||
| "Hello I am doing a project on the 1990s and I need to know what the most popular music", | ||
| "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi", | ||
| ] | ||
| EXPECTED_TEXTS = { | ||
| 7: [ | ||
| "Hello I am doing a project on the 1990s and I need to know what the most popular music", | ||
| "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi", | ||
| ], | ||
| 8: [ | ||
| "Hello I am doing a project on the 1990s and I need to know what the most popular music", | ||
| "Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat", | ||
| ], | ||
| } | ||
|
|
||
| model = AutoModelForCausalLM.from_pretrained( | ||
| model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="sdpa" | ||
|
|
@@ -652,10 +652,11 @@ def test_model_2b_sdpa(self): | |
| output = model.generate(**inputs, max_new_tokens=20, do_sample=False) | ||
| output_text = tokenizer.batch_decode(output, skip_special_tokens=True) | ||
|
|
||
| self.assertEqual(output_text, EXPECTED_TEXTS) | ||
| self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version]) | ||
|
|
||
| @pytest.mark.flash_attn_test | ||
| @require_flash_attn | ||
| @require_read_token | ||
| def test_model_2b_flash_attn(self): | ||
| model_id = "google/gemma-2b" | ||
| EXPECTED_TEXTS = [ | ||
|
|
@@ -677,6 +678,7 @@ def test_model_2b_flash_attn(self): | |
| self.assertEqual(output_text, EXPECTED_TEXTS) | ||
|
|
||
| @require_bitsandbytes | ||
| @require_read_token | ||
| def test_model_2b_4bit(self): | ||
| model_id = "google/gemma-2b" | ||
| EXPECTED_TEXTS = [ | ||
|
|
@@ -695,6 +697,7 @@ def test_model_2b_4bit(self): | |
| self.assertEqual(output_text, EXPECTED_TEXTS) | ||
|
|
||
| @unittest.skip("The test will not fit our CI runners") | ||
| @require_read_token | ||
| def test_model_7b_fp32(self): | ||
| model_id = "google/gemma-7b" | ||
| EXPECTED_TEXTS = [ | ||
|
|
@@ -712,6 +715,7 @@ def test_model_7b_fp32(self): | |
|
|
||
| self.assertEqual(output_text, EXPECTED_TEXTS) | ||
|
|
||
| @require_read_token | ||
| def test_model_7b_fp16(self): | ||
| model_id = "google/gemma-7b" | ||
| EXPECTED_TEXTS = [ | ||
|
|
@@ -731,12 +735,19 @@ def test_model_7b_fp16(self): | |
|
|
||
| self.assertEqual(output_text, EXPECTED_TEXTS) | ||
|
|
||
| @require_read_token | ||
| def test_model_7b_bf16(self): | ||
| model_id = "google/gemma-7b" | ||
| EXPECTED_TEXTS = [ | ||
| """Hello I am doing a project on a 1991 240sx and I am trying to find""", | ||
| "Hi today I am going to show you how to make a very simple and easy to make a very simple and", | ||
| ] | ||
| EXPECTED_TEXTS = { | ||
| 7: [ | ||
| """Hello I am doing a project on a 1991 240sx and I am trying to find""", | ||
| "Hi today I am going to show you how to make a very simple and easy to make a very simple and", | ||
| ], | ||
| 8: [ | ||
| "Hello I am doing a project for my school and I am trying to make a program that will read a .txt file", | ||
| "Hi today I am going to show you how to make a very simple and easy to make a very simple and", | ||
| ], | ||
| } | ||
|
|
||
| model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to( | ||
| torch_device | ||
|
|
@@ -748,8 +759,9 @@ def test_model_7b_bf16(self): | |
| output = model.generate(**inputs, max_new_tokens=20, do_sample=False) | ||
| output_text = tokenizer.batch_decode(output, skip_special_tokens=True) | ||
|
|
||
| self.assertEqual(output_text, EXPECTED_TEXTS) | ||
| self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version]) | ||
|
|
||
| @require_read_token | ||
| def test_model_7b_fp16_static_cache(self): | ||
| model_id = "google/gemma-7b" | ||
| EXPECTED_TEXTS = [ | ||
|
|
@@ -772,12 +784,19 @@ def test_model_7b_fp16_static_cache(self): | |
| self.assertEqual(output_text, EXPECTED_TEXTS) | ||
|
|
||
| @require_bitsandbytes | ||
| @require_read_token | ||
| def test_model_7b_4bit(self): | ||
| model_id = "google/gemma-7b" | ||
| EXPECTED_TEXTS = [ | ||
| "Hello I am doing a project for my school and I am trying to make a program that will take a number and then", | ||
| """Hi today I am going to talk about the new update for the game called "The new update" and I""", | ||
| ] | ||
| EXPECTED_TEXTS = { | ||
| 7: [ | ||
| "Hello I am doing a project for my school and I am trying to make a program that will take a number and then", | ||
| """Hi today I am going to talk about the new update for the game called "The new update" and I""", | ||
| ], | ||
| 8: [ | ||
| "Hello I am doing a project for my school and I am trying to make a program that will take a number and then", | ||
| "Hi today I am going to talk about the best way to get rid of acne. miniaturing is a very", | ||
| ], | ||
| } | ||
|
|
||
| model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, load_in_4bit=True) | ||
|
|
||
|
|
@@ -787,4 +806,4 @@ def test_model_7b_4bit(self): | |
| output = model.generate(**inputs, max_new_tokens=20, do_sample=False) | ||
| output_text = tokenizer.batch_decode(output, skip_special_tokens=True) | ||
|
|
||
| self.assertEqual(output_text, EXPECTED_TEXTS) | ||
| self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version]) | ||
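The recurring change in this file is to key each test's expected generations by the major CUDA compute capability of the runner's GPU, captured once in `setUpClass`. The sketch below is a minimal, self-contained illustration of that pattern; the class name, placeholder strings, and the stand-in for `model.generate` are illustrative and not taken from the PR. Major version 7 corresponds to T4 (7.5), while 8 covers both A10/A10G (8.6) and A100 (8.0), which is why A10 and A100 share a dict entry even though, as the review thread below notes, their greedy outputs can still differ.

```python
import unittest

import torch


class HardwareKeyedExpectationsTest(unittest.TestCase):
    """Illustrative only: mirrors the pattern used in GemmaIntegrationTest / LlamaIntegrationTest."""

    # Major compute capability of the runner GPU: 7 -> T4 (7.5), 8 -> A10/A10G (8.6) or A100 (8.0).
    cuda_compute_capability_major_version = None

    @classmethod
    def setUpClass(cls):
        if torch.cuda.is_available():
            cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0]

    def test_expected_output_per_runner(self):
        if self.cuda_compute_capability_major_version is None:
            self.skipTest("CUDA GPU required")

        # Greedy generations can legitimately differ across GPU generations,
        # so expectations are stored per hardware class instead of as one list.
        EXPECTED_TEXTS = {
            7: ["<expected continuation on a T4 runner>"],
            8: ["<expected continuation on an A10/A100 runner>"],
        }

        # In the real tests, output_text comes from
        # tokenizer.batch_decode(model.generate(...), skip_special_tokens=True);
        # here it is a stand-in so the sketch stays self-contained.
        output_text = EXPECTED_TEXTS[self.cuda_compute_capability_major_version]

        self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version])
```

On CI this keeps a single test body valid across the T4 and A10 runner pools, and updating a hardware-specific reference becomes a one-line dict change.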
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -597,8 +597,18 @@ def test_new_cache_format(self, num_beams, do_sample): | |
| pass | ||
|
|
||
|
|
||
| @require_torch | ||
| @require_torch_gpu | ||
| class LlamaIntegrationTest(unittest.TestCase): | ||
| # This variable is used to determine which CUDA device we are using for our runners (A10 or T4) | ||
| # Depending on the hardware, we get different logits / generations | ||
| cuda_compute_capability_major_version = None | ||
|
|
||
| @classmethod | ||
| def setUpClass(cls): | ||
| if is_torch_available() and torch.cuda.is_available(): | ||
| # 8 is for A100 / A10 and 7 for T4 | ||
| cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0] | ||
|
|
||
| @unittest.skip("Logits are not exactly the same, once we fix the instabalities somehow, will update!") | ||
| @slow | ||
| def test_model_7b_logits(self): | ||
|
|
@@ -675,16 +685,25 @@ def test_model_13b_greedy_generation(self): | |
| @require_read_token | ||
| def test_compile_static_cache(self): | ||
| NUM_TOKENS_TO_GENERATE = 40 | ||
| EXPECTED_TEXT_COMPLETION = [ | ||
| "Simply put, the theory of relativity states that 1) the speed of light is constant, 2) the speed of light is the same for all observers, and 3) the laws of physics are the same for all observers.", | ||
| "My favorite all time favorite condiment is ketchup. I love it on everything. I love it on my eggs, my fries, my chicken, my burgers, my hot dogs, my sandwiches, my salads, my p", | ||
| ] | ||
| EXPECTED_TEXT_COMPLETION = { | ||
| 7: [ | ||
| "Simply put, the theory of relativity states that 1) the speed of light is constant, 2) the speed of light is the same for all observers, and 3) the laws of physics are the same for all observers.", | ||
|
Comment on lines 686 to +690
Contributor: This fails on A100.
Contributor: On the A10 runners with the transformers-all-latest-gpu docker image it passed; might be an env issue?
Contributor: Yes, the ref is probably good for A10G, but not A100. |
||
| "My favorite all time favorite condiment is ketchup. I love it on everything. I love it on my eggs, my fries, my chicken, my burgers, my hot dogs, my sandwiches, my salads, my p", | ||
| ], | ||
| 8: [ | ||
| "Simply put, the theory of relativity states that 1) the speed of light is the same for all observers, and 2) the laws of physics are the same for all observers.\nThe first part of the theory of relativity", | ||
| "My favorite all time favorite condiment is ketchup. I love it on everything. I love it on my eggs, my fries, my chicken, my burgers, my hot dogs, my sandwiches, my salads, my p", | ||
| ], | ||
| } | ||
|
|
||
| prompts = [ | ||
| "Simply put, the theory of relativity states that ", | ||
| "My favorite all time favorite condiment is ketchup.", | ||
| ] | ||
| tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", pad_token="</s>", padding_side="right") | ||
| model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", device_map="sequential") | ||
| model = LlamaForCausalLM.from_pretrained( | ||
| "meta-llama/Llama-2-7b-hf", device_map="sequential", torch_dtype=torch.float16 | ||
| ) | ||
| inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) | ||
|
|
||
| def decode_one_tokens(model, cur_token, input_pos, cache_position): | ||
|
|
@@ -718,7 +737,7 @@ def decode_one_tokens(model, cur_token, input_pos, cache_position): | |
| cache_position += 1 | ||
|
|
||
| text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) | ||
| self.assertEqual(EXPECTED_TEXT_COMPLETION, text) | ||
| self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], text) | ||
|
|
||
|
|
||
| @require_torch | ||
|
|
@@ -763,6 +782,7 @@ def main(): | |
|
|
||
| @require_torch_accelerator | ||
| @slow | ||
| @unittest.skip("Model is too large") | ||
| def test_model_7b_logits(self): | ||
| model = LlamaForCausalLM.from_pretrained("codellama/CodeLlama-7b-hf").to(torch_device) | ||
| tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf") | ||
|
|
||
For the comment here, my response is
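For reference, the `decode_one_tokens` helper exercised by `test_compile_static_cache` above compiles a single greedy decoding step so that tensor shapes stay static. The body below is an assumed reconstruction based on the public `cache_position` API, not code copied from the PR; the static-cache setup on `model` and the surrounding prefill/generation loop are omitted.

```python
import torch


def decode_one_tokens(model, cur_token, input_pos, cache_position):
    # One forward pass for a single new token. Passing cache_position explicitly
    # tells the (already configured) static cache which slot to write, keeping
    # shapes fixed so torch.compile(..., fullgraph=True) can specialize the step.
    logits = model(
        cur_token,
        position_ids=input_pos,
        cache_position=cache_position,
        return_dict=False,
        use_cache=True,
    )[0]
    # Greedy choice, matching do_sample=False in the test.
    new_token = torch.argmax(logits[:, -1], dim=-1)[:, None]
    return new_token


# Typical wrapping in the test loop (sketch): compile the step once, then call it
# per token while advancing the position, as in `cache_position += 1` above.
# compiled_step = torch.compile(decode_one_tokens, mode="reduce-overhead", fullgraph=True)
```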