Merged
35 commits
e6a2b81
fix mistral and mixtral
younesbelkada Mar 22, 2024
3a628af
add pdb
younesbelkada Mar 22, 2024
c7e970f
fix mixtral tesst
younesbelkada Mar 22, 2024
66c2e5d
fix
younesbelkada Mar 22, 2024
361e7cc
fix mistral ?
younesbelkada Mar 22, 2024
55ccbca
add fix gemma
younesbelkada Mar 22, 2024
3c0dc50
fix mistral
younesbelkada Mar 22, 2024
f9ba37c
fix
younesbelkada Mar 22, 2024
73dafde
test
younesbelkada Mar 22, 2024
22585dd
anoter test
younesbelkada Mar 22, 2024
d60ca74
fix
younesbelkada Mar 22, 2024
6983ba9
fix
younesbelkada Mar 22, 2024
0f90b8a
fix mistral tests
younesbelkada Mar 22, 2024
22d1b77
fix them again
younesbelkada Mar 22, 2024
4a4a9e7
final fixes for mistral
younesbelkada Mar 22, 2024
04213a5
fix padding right
younesbelkada Mar 22, 2024
374d23c
fix whipser fa2
younesbelkada Mar 22, 2024
79545e8
fix
younesbelkada Mar 22, 2024
9c5f47a
fix
younesbelkada Mar 22, 2024
58cd487
fix gemma
younesbelkada Mar 22, 2024
0fd9b02
test
younesbelkada Mar 22, 2024
518b6a2
fix llama
younesbelkada Mar 22, 2024
2bd652f
fix
younesbelkada Mar 22, 2024
25a299d
fix
younesbelkada Mar 22, 2024
282a6d3
fix llama gemma
younesbelkada Mar 22, 2024
21a0ddb
add class attribute
younesbelkada Mar 22, 2024
e2c035e
fix CI
younesbelkada Mar 22, 2024
b1e3a35
clarify whisper
younesbelkada Mar 22, 2024
86e962e
compute_capability
ydshieh Mar 27, 2024
e208db1
rename names in some comments
ydshieh Mar 27, 2024
ce2cfa4
Add # fmt: skip
ydshieh Mar 27, 2024
d0c9f7c
make style
ydshieh Mar 27, 2024
bd41a3a
Update tests/models/mistral/test_modeling_mistral.py
ydshieh Mar 29, 2024
751a62c
update
ydshieh Mar 29, 2024
71debdb
update
ydshieh Apr 3, 2024
141 changes: 80 additions & 61 deletions tests/models/gemma/test_modeling_gemma.py
@@ -21,6 +21,7 @@

from transformers import AutoModelForCausalLM, AutoTokenizer, GemmaConfig, is_torch_available
from transformers.testing_utils import (
is_flaky,
require_bitsandbytes,
require_flash_attn,
require_read_token,
@@ -379,40 +380,6 @@ def test_save_load_fast_init_from_base(self):
def test_past_key_values_format(self):
pass

@require_flash_attn
@require_torch_gpu
@pytest.mark.flash_attn_test
@slow
def test_flash_attn_2_generate_padding_right(self):
import torch

for model_class in self.all_generative_model_classes:
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
model = model_class(config)

with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to(
torch_device
)

dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device)
dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device)

model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False)

model = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
low_cpu_mem_usage=True,
).to(torch_device)

with self.assertRaises(ValueError):
_ = model.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False
)

@require_flash_attn
@require_torch_gpu
@pytest.mark.flash_attn_test
@@ -500,6 +467,7 @@ def test_sdpa_equivalence(self):
@require_flash_attn
@require_torch_gpu
@pytest.mark.flash_attn_test
@is_flaky
Collaborator Author:
For the comment here, my response is: I will open an issue, but I will leave @younesbelkada to fill in more details on the issue page. (See the retry sketch after this test's diff.)

@slow
def test_flash_attn_2_equivalence(self):
for model_class in self.all_model_classes:
@@ -531,12 +499,21 @@ def test_flash_attn_2_equivalence(self):
assert torch.allclose(logits_fa, logits, atol=3e-3)
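For readers unfamiliar with the @is_flaky marker added above: it is meant to re-run a failing test a few times and report a failure only if every attempt fails. The snippet below is a minimal, generic retry decorator written purely for illustration; it is not the actual transformers.testing_utils.is_flaky implementation, and its name and parameters are invented for this sketch.

import functools
import time


def retry_flaky(max_attempts: int = 5, wait_seconds: float = 0.0):
    """Illustrative stand-in for a flakiness marker: retry a test, pass if any attempt passes."""

    def decorator(test_func):
        @functools.wraps(test_func)
        def wrapper(*args, **kwargs):
            last_error = None
            for _ in range(max_attempts):
                try:
                    return test_func(*args, **kwargs)
                except AssertionError as err:
                    # Keep the last failure around and retry after an optional pause.
                    last_error = err
                    time.sleep(wait_seconds)
            raise last_error

        return wrapper

    return decorator

Applied as @retry_flaky(max_attempts=3) on a test method, a decorator like this would absorb occasional numerical blips such as the torch.allclose comparison above failing by a hair.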


@require_torch_gpu
@slow
@require_read_token
@require_torch_gpu
class GemmaIntegrationTest(unittest.TestCase):
input_text = ["Hello I am doing", "Hi today"]
# This variable is used to determine which CUDA device are we using for our runners (A10 or T4)
# Depending on the hardware we get different logits / generations
cuda_compute_capability_major_version = None

@classmethod
def setUpClass(cls):
if is_torch_available() and torch.cuda.is_available():
# 8 is for A100 / A10 and 7 for T4
cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0]

@require_read_token
Collaborator Author:
This should be a better name, cc @younesbelkada

See #29805 (comment)

def test_model_2b_fp32(self):
model_id = "google/gemma-2b"
EXPECTED_TEXTS = [
@@ -554,6 +531,7 @@ def test_model_2b_fp32(self):

self.assertEqual(output_text, EXPECTED_TEXTS)

@require_read_token
def test_model_2b_fp16(self):
model_id = "google/gemma-2b"
EXPECTED_TEXTS = [
@@ -573,6 +551,7 @@ def test_model_2b_fp16(self):

self.assertEqual(output_text, EXPECTED_TEXTS)

@require_read_token
def test_model_2b_fp16_static_cache(self):
model_id = "google/gemma-2b"
EXPECTED_TEXTS = [
@@ -594,12 +573,19 @@ def test_model_2b_fp16_static_cache(self):

self.assertEqual(output_text, EXPECTED_TEXTS)

@require_read_token
def test_model_2b_bf16(self):
model_id = "google/gemma-2b"
EXPECTED_TEXTS = [
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi",
]
EXPECTED_TEXTS = {
7: [
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi",
],
8: [
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
],
}

model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to(
torch_device
@@ -611,14 +597,21 @@ def test_model_2b_bf16(self):
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
output_text = tokenizer.batch_decode(output, skip_special_tokens=True)

self.assertEqual(output_text, EXPECTED_TEXTS)
self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version])

@require_read_token
def test_model_2b_eager(self):
model_id = "google/gemma-2b"
EXPECTED_TEXTS = [
"Hello I am doing a project on the 1990s and I am looking for some information on the ",
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
]
EXPECTED_TEXTS = {
7: [
"Hello I am doing a project on the 1990s and I am looking for some information on the ",
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
],
8: [
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
],
}

model = AutoModelForCausalLM.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="eager"
@@ -631,15 +624,22 @@ def test_model_2b_eager(self):
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
output_text = tokenizer.batch_decode(output, skip_special_tokens=True)

self.assertEqual(output_text, EXPECTED_TEXTS)
self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version])

@require_torch_sdpa
@require_read_token
def test_model_2b_sdpa(self):
model_id = "google/gemma-2b"
EXPECTED_TEXTS = [
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi",
]
EXPECTED_TEXTS = {
7: [
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Khichdi",
],
8: [
"Hello I am doing a project on the 1990s and I need to know what the most popular music",
"Hi today I am going to share with you a very easy and simple recipe of <strong><em>Kaju Kat",
],
}

model = AutoModelForCausalLM.from_pretrained(
model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="sdpa"
@@ -652,10 +652,11 @@ def test_model_2b_sdpa(self):
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
output_text = tokenizer.batch_decode(output, skip_special_tokens=True)

self.assertEqual(output_text, EXPECTED_TEXTS)
self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version])

@pytest.mark.flash_attn_test
@require_flash_attn
@require_read_token
def test_model_2b_flash_attn(self):
model_id = "google/gemma-2b"
EXPECTED_TEXTS = [
@@ -677,6 +678,7 @@ def test_model_2b_flash_attn(self):
self.assertEqual(output_text, EXPECTED_TEXTS)

@require_bitsandbytes
@require_read_token
def test_model_2b_4bit(self):
model_id = "google/gemma-2b"
EXPECTED_TEXTS = [
@@ -695,6 +697,7 @@ def test_model_2b_4bit(self):
self.assertEqual(output_text, EXPECTED_TEXTS)

@unittest.skip("The test will not fit our CI runners")
@require_read_token
def test_model_7b_fp32(self):
model_id = "google/gemma-7b"
EXPECTED_TEXTS = [
@@ -712,6 +715,7 @@ def test_model_7b_fp32(self):

self.assertEqual(output_text, EXPECTED_TEXTS)

@require_read_token
def test_model_7b_fp16(self):
model_id = "google/gemma-7b"
EXPECTED_TEXTS = [
@@ -731,12 +735,19 @@ def test_model_7b_fp16(self):

self.assertEqual(output_text, EXPECTED_TEXTS)

@require_read_token
def test_model_7b_bf16(self):
model_id = "google/gemma-7b"
EXPECTED_TEXTS = [
"""Hello I am doing a project on a 1991 240sx and I am trying to find""",
"Hi today I am going to show you how to make a very simple and easy to make a very simple and",
]
EXPECTED_TEXTS = {
7: [
"""Hello I am doing a project on a 1991 240sx and I am trying to find""",
"Hi today I am going to show you how to make a very simple and easy to make a very simple and",
],
8: [
"Hello I am doing a project for my school and I am trying to make a program that will read a .txt file",
"Hi today I am going to show you how to make a very simple and easy to make a very simple and",
],
}

model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to(
torch_device
@@ -748,8 +759,9 @@ def test_model_7b_bf16(self):
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
output_text = tokenizer.batch_decode(output, skip_special_tokens=True)

self.assertEqual(output_text, EXPECTED_TEXTS)
self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version])

@require_read_token
def test_model_7b_fp16_static_cache(self):
model_id = "google/gemma-7b"
EXPECTED_TEXTS = [
@@ -772,12 +784,19 @@ def test_model_7b_fp16_static_cache(self):
self.assertEqual(output_text, EXPECTED_TEXTS)

@require_bitsandbytes
@require_read_token
def test_model_7b_4bit(self):
model_id = "google/gemma-7b"
EXPECTED_TEXTS = [
"Hello I am doing a project for my school and I am trying to make a program that will take a number and then",
"""Hi today I am going to talk about the new update for the game called "The new update" and I""",
]
EXPECTED_TEXTS = {
7: [
"Hello I am doing a project for my school and I am trying to make a program that will take a number and then",
"""Hi today I am going to talk about the new update for the game called "The new update" and I""",
],
8: [
"Hello I am doing a project for my school and I am trying to make a program that will take a number and then",
"Hi today I am going to talk about the best way to get rid of acne. miniaturing is a very",
],
}

model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, load_in_4bit=True)

@@ -787,4 +806,4 @@ def test_model_7b_4bit(self):
output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
output_text = tokenizer.batch_decode(output, skip_special_tokens=True)

self.assertEqual(output_text, EXPECTED_TEXTS)
self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version])
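The hardware-keyed expectations introduced in the file above boil down to one pattern: record the GPU's major compute capability once in setUpClass, then index a dict of reference generations with it. Here is a self-contained sketch of that pattern; the class name, the placeholder strings, and the trivial stand-in for model.generate are invented for illustration and are not taken from the PR.

import unittest

import torch


class HardwareKeyedExpectationsTest(unittest.TestCase):
    # Major compute capability of the current GPU (e.g. 7 on T4, 8 on A10/A100).
    cuda_compute_capability_major_version = None

    @classmethod
    def setUpClass(cls):
        if torch.cuda.is_available():
            cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0]

    def test_generation_matches_reference(self):
        if self.cuda_compute_capability_major_version is None:
            self.skipTest("requires a CUDA device")

        # One reference per runner family, because greedy decoding can diverge across GPUs.
        expected = {
            7: ["placeholder reference recorded on a T4 runner"],
            8: ["placeholder reference recorded on an A10/A100 runner"],
        }

        # Stand-in for tokenizer.batch_decode(model.generate(...), skip_special_tokens=True).
        output_text = expected[self.cuda_compute_capability_major_version]

        self.assertEqual(output_text, expected[self.cuda_compute_capability_major_version])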
34 changes: 27 additions & 7 deletions tests/models/llama/test_modeling_llama.py
@@ -597,8 +597,18 @@ def test_new_cache_format(self, num_beams, do_sample):
pass


@require_torch
@require_torch_gpu
class LlamaIntegrationTest(unittest.TestCase):
# This variable is used to determine which CUDA device are we using for our runners (A10 or T4)
# Depending on the hardware we get different logits / generations
cuda_compute_capability_major_version = None

@classmethod
def setUpClass(cls):
if is_torch_available() and torch.cuda.is_available():
# 8 is for A100 / A10 and 7 for T4
cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0]

@unittest.skip("Logits are not exactly the same, once we fix the instabalities somehow, will update!")
@slow
def test_model_7b_logits(self):
@@ -675,16 +685,25 @@ def test_model_13b_greedy_generation(self):
@require_read_token
def test_compile_static_cache(self):
NUM_TOKENS_TO_GENERATE = 40
EXPECTED_TEXT_COMPLETION = [
"Simply put, the theory of relativity states that 1) the speed of light is constant, 2) the speed of light is the same for all observers, and 3) the laws of physics are the same for all observers.",
"My favorite all time favorite condiment is ketchup. I love it on everything. I love it on my eggs, my fries, my chicken, my burgers, my hot dogs, my sandwiches, my salads, my p",
]
EXPECTED_TEXT_COMPLETION = {
7: [
"Simply put, the theory of relativity states that 1) the speed of light is constant, 2) the speed of light is the same for all observers, and 3) the laws of physics are the same for all observers.",
Comment on lines 686 to +690

Contributor:
This fails on A100.

Contributor:
On the A10 runners with the transformers-all-latest-gpu docker image it passed; might be an env issue?

Contributor:
Yes, the ref is probably good for A10G, but not for A100. (See the capability-query sketch after this test's diff.)

"My favorite all time favorite condiment is ketchup. I love it on everything. I love it on my eggs, my fries, my chicken, my burgers, my hot dogs, my sandwiches, my salads, my p",
],
8: [
"Simply put, the theory of relativity states that 1) the speed of light is the same for all observers, and 2) the laws of physics are the same for all observers.\nThe first part of the theory of relativity",
"My favorite all time favorite condiment is ketchup. I love it on everything. I love it on my eggs, my fries, my chicken, my burgers, my hot dogs, my sandwiches, my salads, my p",
],
}

prompts = [
"Simply put, the theory of relativity states that ",
"My favorite all time favorite condiment is ketchup.",
]
tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", pad_token="</s>", padding_side="right")
model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", device_map="sequential")
model = LlamaForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf", device_map="sequential", torch_dtype=torch.float16
)
inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)

def decode_one_tokens(model, cur_token, input_pos, cache_position):
@@ -718,7 +737,7 @@ def decode_one_tokens(model, cur_token, input_pos, cache_position):
cache_position += 1

text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
self.assertEqual(EXPECTED_TEXT_COMPLETION, text)
self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], text)
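Relating to the comment thread above about differing references on A100 and A10: keying only on the major compute capability groups those two cards together (to the best of my knowledge, A100 reports 8.0, A10/A10G reports 8.6, and T4 reports 7.5), so the full capability tuple or the device name is needed to tell them apart. A minimal query sketch:

import torch

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    device_name = torch.cuda.get_device_name()
    # Typical runner values: (7, 5) "Tesla T4", (8, 0) "NVIDIA A100 ...", (8, 6) "NVIDIA A10G".
    print(f"{device_name}: compute capability {major}.{minor}")
    # The integration tests above index their expectations with `major` only,
    # so A100 and A10G land in the same bucket (8), which may explain the
    # differing results reported in the thread.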


@require_torch
@@ -763,6 +782,7 @@ def main():

@require_torch_accelerator
@slow
@unittest.skip("Model is too large")
def test_model_7b_logits(self):
model = LlamaForCausalLM.from_pretrained("codellama/CodeLlama-7b-hf").to(torch_device)
tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")