64 changes: 52 additions & 12 deletions tests/models/falcon_mamba/test_modeling_falcon_mamba.py
@@ -19,9 +19,12 @@

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, FalconMambaConfig, is_torch_available
from transformers.testing_utils import (
Expectations,
cleanup,
require_bitsandbytes,
require_torch,
require_torch_accelerator,
require_torch_large_accelerator,
require_torch_multi_accelerator,
require_torch_multi_gpu,
slow,
@@ -450,15 +453,30 @@ def setUp(self):
self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
self.text = "Hello today"

def test_generation_bf16(self):
model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.bfloat16, device_map="auto")
cleanup(torch_device, gc_collect=True)

def tearDown(self):
cleanup(torch_device, gc_collect=True)

# On T4, get `NotImplementedError: Cannot copy out of meta tensor; no data!`
Contributor: Might be too little memory or some cpu offloading issues?

Collaborator (author): Yeah, I guess so too. But if I remove device_map="auto" and add .to(torch_device) instead,

model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.float16).to(torch_device)

the generation runs without OOM. So it's a bit strange to me why "auto" would cause a problem.

Contributor: Maybe someone to cc here? It's not a deal breaker, but it might be interesting to investigate if someone wants to / has the time.

Collaborator (author): I will open an issue and ping.
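For context, a minimal sketch of the two loading paths compared in this thread, assuming the falcon-mamba-7b checkpoint used elsewhere in this file; whether the meta-tensor error reproduces depends on the runner's memory and on any CPU offloading that device_map="auto" decides to do:

import torch
from transformers import AutoModelForCausalLM

model_id = "tiiuae/falcon-mamba-7b"  # assumption: same checkpoint as self.model_id in this test class

# Path 1: let accelerate place (and possibly offload) the weights automatically.
# On a T4 runner this is the call that raised
# `NotImplementedError: Cannot copy out of meta tensor; no data!`
model_auto = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

# Path 2: materialize the weights on CPU, then move them to the accelerator explicitly.
model_manual = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")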

@require_torch_large_accelerator
def test_generation_fp16(self):
model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.float16, device_map="auto")

inputs = self.tokenizer(self.text, return_tensors="pt").to(torch_device)
out = model.generate(**inputs, max_new_tokens=20, do_sample=False)

EXPECTED_OUTPUTS = Expectations(
{
("cuda", 7): "Hello today I am going to show you how to make a simple and easy to make paper plane.\nStep",
("cuda", 8): 'Hello today Iava,\n\nI am writing to you today to discuss the importance of maintaining a healthy lifestyle',
}
) # fmt: skip
EXPECTED_OUTPUT = EXPECTED_OUTPUTS.get_expectation()

self.assertEqual(
self.tokenizer.batch_decode(out, skip_special_tokens=False)[0],
"Hello today I am going to show you how to make a simple and easy to make paper plane.\nStep",
EXPECTED_OUTPUT,
)
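As a side note, the Expectations helper used in this hunk (from transformers.testing_utils) lets one slow test assert different greedy outputs on different CI hardware. A rough, illustrative sketch of the selection idea; this is not the transformers implementation, and the key convention ("cuda", <compute-capability major>) is inferred from the ("cuda", 7) / ("cuda", 8) entries above:

import torch

# Assumed key convention: ("cuda", 7) covers T4/V100-class runners, ("cuda", 8) covers A100-class runners.
if torch.cuda.is_available():
    device_key = ("cuda", torch.cuda.get_device_capability()[0])
else:
    device_key = ("cpu", None)  # placeholder; the real helper handles more device types

expectations = {
    ("cuda", 7): "Hello today I am going to show you how to make a simple and easy to make paper plane.\nStep",
    ("cuda", 8): "Hello today Iava,\n\nI am writing to you today to discuss the importance of maintaining a healthy lifestyle",
}
expected_output = expectations.get(device_key)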

@require_bitsandbytes
@@ -471,19 +489,19 @@ def test_generation_4bit(self):

self.assertEqual(
self.tokenizer.batch_decode(out, skip_special_tokens=False)[0],
"""Hello today I'm going to talk about the "C" in the "C-I-""",
"Hello today Iava,\n\nI'm sorry to hear that you're having trouble with the ",
)

def test_generation_torch_compile(self):
model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.bfloat16).to(torch_device)
model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.float16).to(torch_device)
model = torch.compile(model)

inputs = self.tokenizer(self.text, return_tensors="pt").to(torch_device)
out = model.generate(**inputs, max_new_tokens=20, do_sample=False)

self.assertEqual(
self.tokenizer.batch_decode(out, skip_special_tokens=False)[0],
"Hello today I am going to show you how to make a simple and easy to make paper plane.\nStep",
"Hello today Iava,\n\nI am writing to you today to discuss the importance of maintaining a healthy lifestyle",
)

def test_batched_generation(self):
@@ -493,13 +511,22 @@ def test_batched_generation(self):

texts = ["Hello today", "Hello my name is Younes and today"]

EXPECTED_OUTPUT = [
"Hello today I'm going to show you how to make a 3D model of a house.\n",
"Hello my name is Younes and today I will be talking about the topic of “The importance of the internet in our life”.\n",
]
EXPECTED_OUTPUTS = Expectations(
{
("cuda", 7): [
'Hello today I will be talking about the “Theory of Relativity” by Albert Einstein.\nThe',
'Hello my name is Younes and today I will be talking about the importance of the internet in our lives.\nThe internet is a global',
],
("cuda", 8): [
'Hello today I am going to talk about the “Theory of Relativity” by Albert Einstein.\n',
'Hello my name is Younes and today I will be talking about the importance of the internet in our lives.\nThe internet is a global',
],
}
) # fmt: skip
EXPECTED_OUTPUT = EXPECTED_OUTPUTS.get_expectation()

inputs = tok(texts, return_tensors="pt", padding=True, return_token_type_ids=False).to(torch_device)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=0, torch_dtype=torch.bfloat16)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=0, torch_dtype=torch.float16)

out = model.generate(**inputs, max_new_tokens=20)
out = tok.batch_decode(out, skip_special_tokens=True)
@@ -514,14 +541,27 @@ def test_batched_generation(self):
out = model.generate(**inputs, max_new_tokens=20)
out = tok.batch_decode(out, skip_special_tokens=True)

EXPECTED_OUTPUTS = Expectations(
{
("cuda", 7): [
' I will be talking about the “Theory of Relativity” by Albert Einstein.\nThe',
' I will be talking about the importance of the internet in our lives.\nThe internet is a global',
],
("cuda", 8): [
' I am going to talk about the “Theory of Relativity” by Albert Einstein.\n',
' I will be talking about the importance of the internet in our lives.\nThe internet is a global'
],
Comment on lines +546 to +553
Collaborator (author): @gante I guess it's normal that, with inputs_embeds, we don't have the prompt included, right?

Contributor: In my opinion, that doesn't seem correct. It would be weird to expect different behaviour here, since we generate from the same "prompt". That might be a regression somewhere.

Collaborator (author): I am not an expert (that is why I pinged @gante), but I kind of think it's normal. For the prompt part we only pass embeddings, not token ids, and the token ids cannot be recovered from the embeddings. That is why only the generated part is returned (see the sketch after this hunk).

Contributor: It would be nice to know whether this test passed on the original commit, or at least whether the same issue (no prompt) already occurred with input embeds then. Both behaviours are justifiable, in my opinion.

Collaborator (author): It was already failing when the test was written 😢 (That's why I always say it's important to use run-slow on PRs.)

Collaborator (author): You are right, my bad; I moved too fast and got the wrong results. It's on

https://huggingface.slack.com/archives/C06LR9PQA00/p1725987868413609?thread_ts=1725987848.555519&cid=C06LR9PQA00

I will check the commit from the day before to see what happened.

Collaborator (author): I confirmed that this test was already failing when it was added on 2025/06/19; I checked out that commit and ran the test. (Around that time, several CI runs were triggered manually on different commits, so it is hard to check the Slack channels.)

Contributor: Sorry to be so picky, but was it also failing on (at least) no prefix being returned 👀 If yes, I think this is fine to merge.

Collaborator (author): Yes, same failure reason. Not picky, it's fine. I am also happy to wait for Joao's response; there is no urgency to merge at all :-)

Collaborator (author): Gentle ping @gante

}
) # fmt: skip
EXPECTED_OUTPUT = EXPECTED_OUTPUTS.get_expectation()
self.assertListEqual(out, EXPECTED_OUTPUT)
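Regarding the thread above about the missing prompt prefix, here is a minimal sketch of the behaviour being discussed, assuming the falcon-mamba-7b checkpoint and the standard generate API (illustrative, not taken from this test file): when generate receives only inputs_embeds, it has no prompt token ids to prepend, so the decoded strings contain just the newly generated tokens, which is why the expectations in this hunk start at " I will be talking...".

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "tiiuae/falcon-mamba-7b"  # assumption: same checkpoint as in test_batched_generation
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=0, torch_dtype=torch.float16)

inputs = tok("Hello today", return_tensors="pt", return_token_type_ids=False).to(model.device)

# Generating from token ids: the returned sequences include the prompt ids.
out_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)

# Generating from embeddings only: the prompt token ids cannot be recovered from
# the embeddings, so the returned sequences contain only the generated tokens.
embeds = model.get_input_embeddings()(inputs.input_ids)
out_embeds = model.generate(inputs_embeds=embeds, attention_mask=inputs.attention_mask, max_new_tokens=20, do_sample=False)

print(out_ids.shape[1])     # prompt length + up to 20 new tokens
print(out_embeds.shape[1])  # up to 20 new tokens, no prompt prefix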

@require_torch_multi_accelerator
def test_training_kernel(self):
model_id = "tiiuae/falcon-mamba-7b"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16)
tokenizer.pad_token_id = tokenizer.eos_token_id

text = "Hello today"