27 changes: 3 additions & 24 deletions tests/models/mistral/test_modeling_mistral.py
@@ -526,7 +526,7 @@ def test_model_7b_logits(self):
# Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s,
# considering differences in hardware processing and potential deviations in output.
EXPECTED_SLICE = {
7: torch.tensor([-5.8781, -5.8616, -0.1052, -4.7200, -5.8781, -5.8774, -5.8773, -5.8777, -5.8781, -5.8780, -5.8781, -5.8779, -1.0787, 1.7583, -5.8779, -5.8780, -5.8783, -5.8778, -5.8776, -5.8781, -5.8784, -5.8778, -5.8778, -5.8777, -5.8779, -5.8778, -5.8776, -5.8780, -5.8779, -5.8781]),
7: torch.tensor([-5.8828, -5.8633, -0.1042, -4.7266, -5.8828, -5.8789, -5.8789, -5.8828, -5.8828, -5.8828, -5.8828, -5.8828, -1.0801, 1.7598, -5.8828, -5.8828, -5.8828, -5.8828, -5.8828, -5.8828, -5.8828, -5.8828, -5.8828, -5.8828, -5.8828, -5.8828, -5.8828, -5.8828, -5.8828, -5.8828]),
Collaborator (author): Should have updated this in #29905 but forgot.
8: torch.tensor([-5.8711, -5.8555, -0.1050, -4.7148, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -1.0781, 1.7568, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711]),
9: torch.tensor([-5.8750, -5.8594, -0.1047, -4.7188, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -1.0781, 1.7578, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750]),
} # fmt: skip
@@ -535,15 +535,11 @@ def test_model_7b_logits(self):
out[0, 0, :30], EXPECTED_SLICE[self.cuda_compute_capability_major_version], atol=1e-4, rtol=1e-4
)

del model
backend_empty_cache(torch_device)
gc.collect()
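
A note on the hardware keys (7, 8, 9) used in EXPECTED_SLICE and in the other expectation dicts below: they are the CUDA compute capability major version of the GPU the test runs on (7.x for V100/T4-class cards, 8.x for A100-class, 9 for H100 and, per the note above, currently repurposed for MI300). A minimal sketch of how such a key can be derived; the helper below is a hypothetical stand-in for the test class's cuda_compute_capability_major_version attribute, not code from this file:

import torch

def compute_capability_major() -> int:
    # Hypothetical stand-in for the test class's
    # `cuda_compute_capability_major_version` attribute: the major part of
    # torch.cuda.get_device_capability(), e.g. 7 (V100/T4), 8 (A100-class),
    # 9 (H100; the note above says key 9 is currently set for MI300).
    major, _minor = torch.cuda.get_device_capability()
    return major

# The tests then index the expectation dicts with this key, e.g.:
# expected = EXPECTED_SLICE[compute_capability_major()]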

@slow
@require_bitsandbytes
def test_model_7b_generation(self):
EXPECTED_TEXT_COMPLETION = {
7: "My favourite condiment is 100% ketchup. I love it on everything. I'm not a big",
7: "My favourite condiment is 100% ketchup. Im not a fan of mustard, mayo,",
Collaborator (author): Should have updated this in #29905 but forgot.
8: "My favourite condiment is 100% ketchup. I’m not a fan of mustard, mayo,",
}

@@ -559,10 +555,6 @@ def test_model_7b_generation(self):
text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], text)

del model
backend_empty_cache(torch_device)
gc.collect()

Comment on lines -562 to -565
Collaborator (author): These don't help and, worse, cause GPU OOM in some subsequent tests.

Contributor: Happy to have this deleted, but very confused why this would cause OOM 😭

Collaborator (author), @ydshieh, Jun 4, 2024: I have to say I am confused too. torch.cuda.empty_cache is not really magic; as the PyTorch docs put it,

empty_cache() doesn't increase the amount of GPU memory available for PyTorch.

but I was not expecting it to have an undesired side effect like this (even if it is not helpful). I haven't checked whether del model and gc.collect() play a role here, though.

Collaborator (author): Out of curiosity, and to keep the info here for the record:

  • it is test_model_7b_long_prompt that gets the OOM.
    • previously, with those empty-cache calls, nvidia-smi showed 150MiB / 15360MiB at the start of test_model_7b_long_prompt, which looks nice, but we got an OOM later inside the test
    • without the empty-cache calls, nvidia-smi shows 9066MiB / 15360MiB, which looks not great, but we DON'T get an OOM later inside the test

It's very mysterious to me.
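
For anyone digging into the OOM mystery above: the number nvidia-smi reports mostly reflects the CUDA caching allocator's reserved pool, which is what empty_cache() releases, while the memory held by live tensors is a separate counter. A small sketch (plain PyTorch, not part of this PR) for printing both around the removed cleanup calls:

import gc
import torch

def report(tag: str) -> None:
    # memory_allocated: bytes held by live tensors (what del/gc can free)
    # memory_reserved: bytes the caching allocator keeps from the driver
    # (roughly what nvidia-smi attributes to the process)
    alloc = torch.cuda.memory_allocated() / 2**20
    reserved = torch.cuda.memory_reserved() / 2**20
    print(f"{tag}: allocated={alloc:.0f}MiB reserved={reserved:.0f}MiB")

report("before cleanup")
# del model                   # drop the last reference to the model
gc.collect()                  # collect any cycles still holding tensors
torch.cuda.empty_cache()      # hand cached, unused blocks back to the driver
report("after cleanup")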

@require_bitsandbytes
@slow
@require_flash_attn
@@ -587,11 +579,6 @@ def test_model_7b_long_prompt(self):
generated_ids = model.generate(input_ids, max_new_tokens=4, temperature=0)
self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-2:].tolist())

del assistant_model
del model
backend_empty_cache(torch_device)
gc.collect()

@slow
@require_torch_sdpa
def test_model_7b_long_prompt_sdpa(self):
@@ -635,7 +622,7 @@ def test_speculative_generation(self):
# Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s,
# considering differences in hardware processing and potential deviations in generated text.
EXPECTED_TEXT_COMPLETION = {
7: "My favourite condiment is 100% Sriracha. I love the heat, the tang and the fact costs",
7: "My favourite condiment is 100% ketchup. I love it on everything. I’m not a big",
Collaborator (author): See the PR description.

8: "My favourite condiment is 100% ketchup. I love it on everything. I’m not a big",
9: "My favourite condiment is 100% ketchup. I love it on everything. I’m not a big",
}
@@ -654,10 +641,6 @@ def test_speculative_generation(self):
text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], text)

del model
backend_empty_cache(torch_device)
gc.collect()
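
For context, test_speculative_generation exercises assisted (speculative) decoding, where a draft model proposes tokens that the main model then verifies, via the assistant_model argument of generate(). A rough, self-contained sketch of that call shape; the checkpoint name and generation settings are illustrative, not copied from the test:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "mistralai/Mistral-7B-v0.1"  # illustrative; see the test for the real setup
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint, torch_dtype=torch.float16, device_map="cuda"
)
# Normally the assistant is a smaller, faster draft model; reusing the same
# checkpoint here just keeps the sketch short.
assistant_model = AutoModelForCausalLM.from_pretrained(
    checkpoint, torch_dtype=torch.float16, device_map="cuda"
)

inputs = tokenizer("My favourite condiment is", return_tensors="pt").to("cuda")
generated_ids = model.generate(
    **inputs, assistant_model=assistant_model, max_new_tokens=20, do_sample=False
)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))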

@slow
@require_read_token
def test_compile_static_cache(self):
@@ -726,10 +709,6 @@ def test_compile_static_cache(self):
static_compiled_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], static_compiled_text)

del model
backend_empty_cache(torch_device)
gc.collect()
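
Similarly, test_compile_static_cache covers generation with a torch.compile'd forward pass and a static KV cache, which pre-allocates the cache to a fixed shape so the compiled graph sees stable shapes. A rough sketch of that pattern, again illustrative; the test's exact prompts and expectations sit in the collapsed part of the diff:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "mistralai/Mistral-7B-v0.1"  # illustrative
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint, torch_dtype=torch.float16, device_map="cuda"
)
inputs = tokenizer("My favourite condiment is", return_tensors="pt").to("cuda")

# Compile the forward pass; the static cache keeps KV shapes fixed across steps,
# so compiled kernels can be reused instead of re-tracing every generated token.
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
generated_ids = model.generate(
    **inputs, max_new_tokens=20, do_sample=False, cache_implementation="static"
)
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))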


@slow
@require_torch_gpu