Fix inverted unload condition in _evaluate_task.

_evaluate_task records whether the task's data was already loaded before it
runs. The old code called task.unload_data() when the data HAD been loaded
beforehand, which is the opposite of what its own comment ("only unload if we
loaded the data") describes. The flag is renamed to data_preloaded and the
condition is negated, so data is unloaded only when _evaluate_task loaded it.

Two tests are added: one checks that data loaded by evaluate() is unloaded
afterwards, the other that data loaded by the caller is still accessible
after repeated evaluate() calls.

diff --git a/mteb/evaluate.py b/mteb/evaluate.py
index 0b24b37074..baf15ded13 100644
--- a/mteb/evaluate.py
+++ b/mteb/evaluate.py
@@ -132,8 +132,8 @@ def _evaluate_task(
 
     task.check_if_dataset_is_superseded()
 
-    data_loaded = task.data_loaded
-    if not data_loaded:
+    data_preloaded = task.data_loaded
+    if not data_preloaded:
         try:
             task.load_data()
         except DatasetNotFoundError as e:
@@ -176,7 +176,7 @@ def _evaluate_task(
         kg_co2_emissions=None,
     )
 
-    if data_loaded:  # only unload if we loaded the data
+    if not data_preloaded:  # only unload if we loaded the data
         task.unload_data()
 
     return result
diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py
index 2a60e0df74..490d140999 100644
--- a/tests/test_evaluate.py
+++ b/tests/test_evaluate.py
@@ -271,3 +271,28 @@ def load_error():
     results = mteb.evaluate(model, [error_task, task], cache=None, raise_error=False)
     assert len(results.task_results) == 1
     assert len(results.exceptions) == 1
+
+
+def test_evaluate_unloads_data_when_not_preloaded():
+    """Test that evaluate() unloads data when it was not preloaded."""
+    model = MockSentenceTransformer()
+    task = MockClassificationTask()
+
+    assert task.data_loaded is False
+    mteb.evaluate(model, task, cache=None, co2_tracker=False)
+    assert task.data_loaded is False, "evaluate() should unload data it loaded"
+
+
+def test_evaluate_preserves_preloaded_data_across_multiple_calls():
+    """Test that preloaded data persists across multiple evaluate() calls."""
+    model = MockSentenceTransformer()
+    task = MockClassificationTask()
+
+    task.load_data()
+    assert task.data_loaded is True
+
+    mteb.evaluate(model, task, cache=None, co2_tracker=False)
+    _ = task.dataset["test"]  # Verify dataset wasn't unloaded
+
+    mteb.evaluate(model, task, cache=None, co2_tracker=False)
+    _ = task.dataset["test"]  # Verify dataset persists across multiple calls