Commit a218c4d
hynky1999 and NathanHB authored
Logging Revamp (#284)
What does this implement/fix? Explain your changes.
---------------------------------------------------
This PR revamps the output saving system.

- Adds fsspec support for the output directory.
- Fixes a bug where the save_details parameter was never used or passed (it is now used).
- Since one can now push results/details directly to the Hub, the separate push_x_to_hub options are collapsed into a single push_to_hub, which behaves the way push_details_to_hub did before.
- Fixes the task_name extraction so that it doesn't explode once we move to year 2025 🫠
- Adds tests checking that the evaluation results are saved correctly. I omitted tests for tensorboard logging as I haven't changed anything there.

Comments
----------
- The tests now require HF_TOKEN, which can write/read in `lighteval-tests`. I have created the org and can give ownership to either @clefourrier or @NathanHB.
- Having secrets accessible during tests is a big security risk, especially when the tests can run without any interaction on PRs, but if the token only has permission to the lighteval-tests org I think it's fine.
- We should probably first merge a PR which gives lighteval ownership over the lighteval config. Right now I can adjust the lighteval config for the nanotron path to reflect the new API. You can review this PR in the meantime, but I added a TODO so that we don't forget. PS: That PR doesn't exist yet.

---------

Co-authored-by: Nathan Habib <[email protected]>
Co-authored-by: Nathan Habib <[email protected]>
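The diffs below boil down to one constructor: here is a minimal sketch of the revamped call surface, assembled from the main_accelerate.py change further down. The bucket and org names are placeholders, not values taken from this PR.

    # Sketch only: kwargs mirror the EvaluationTracker call in src/lighteval/main_accelerate.py below.
    from lighteval.logging.evaluation_tracker import EvaluationTracker

    evaluation_tracker = EvaluationTracker(
        output_dir="s3://my-eval-bucket/lighteval",  # any fsspec-compliant path, or a plain local directory
        save_details=True,                           # now actually forwarded (previously never passed through)
        push_to_hub=True,                            # replaces push_results_to_hub / push_details_to_hub
        push_to_tensorboard=False,                   # replaces push_results_to_tensorboard
        public=False,                                # keep the pushed repos private
        hub_results_org="my-org",                    # org that receives the pushed dataset repos
    )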
1 parent a84053d commit a218c4d

File tree

9 files changed: +342 -130 lines changed


.github/workflows/tests.yaml

Lines changed: 2 additions & 0 deletions
@@ -34,6 +34,8 @@ jobs:
           path: "cache"
           key: test-cache-HF
       - name: Test
+        env:
+          HF_TEST_TOKEN: ${{ secrets.HF_TEST_TOKEN }}
         run: | # PYTHONPATH="${PYTHONPATH}:src" HF_DATASETS_CACHE="cache/datasets" HF_HOME="cache/models"
           python -m pytest --disable-pytest-warnings
       - name: Write cache
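The HF_TEST_TOKEN secret added above is consumed by the new tests/fixtures.py further down, which copies it into HF_TOKEN for the duration of a test. As a purely hypothetical addition (not part of this PR), a skip guard would let the suite degrade gracefully when the secret is absent, e.g. on forks:

    # Hypothetical guard, not in this PR: skip Hub-dependent tests when the secret is missing.
    import os

    import pytest

    requires_hf_test_token = pytest.mark.skipif(
        os.getenv("HF_TEST_TOKEN") is None,
        reason="HF_TEST_TOKEN secret is not available in this environment",
    )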

pyproject.toml

Lines changed: 2 additions & 0 deletions
@@ -74,6 +74,7 @@ dependencies = [
   "sentencepiece>=0.1.99",
   "protobuf==3.20.*", # pinned for sentencepiece compat
   "pycountry",
+  "fsspec>=2023.12.2",
 ]

 [project.optional-dependencies]
@@ -95,6 +96,7 @@ extended_tasks = [
   "langdetect", # ifeval
   "openai", # llm as a judge using openai models
 ]
+s3 = ["s3fs"]

 [project.urls]
 Homepage = "https://github.com/huggingface/lighteval"
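With fsspec as a core dependency and s3fs behind the new s3 extra (pip install lighteval[s3]), output_dir can point at object storage as well as local disk. A minimal sketch of how such a path resolves; the bucket name is illustrative, and actually writing to it needs the s3 extra plus valid AWS credentials:

    # Illustrative only: "my-eval-bucket" is a made-up bucket.
    import fsspec

    fs, root = fsspec.core.url_to_fs("s3://my-eval-bucket/lighteval")  # -> (S3FileSystem, "my-eval-bucket/lighteval")
    with fsspec.open("s3://my-eval-bucket/lighteval/ping.json", "w") as f:
        f.write("{}")  # the tracker presumably writes its results JSON through a similar call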

src/lighteval/config/lighteval_config.py

Lines changed: 2 additions & 3 deletions
@@ -58,9 +58,8 @@ class LightEvalLoggingArgs:

     output_dir: str
     save_details: bool = True
-    push_results_to_hub: bool = False
-    push_details_to_hub: bool = False
-    push_results_to_tensorboard: bool = False
+    push_to_hub: bool = False
+    push_to_tensorboard: bool = False
     public_run: bool = False
     results_org: str | None = None
     tensorboard_metric_prefix: str = "eval"
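For nanotron runs the same rename surfaces at the config level; a sketch of instantiating the logging args with the new fields, assuming the class is a plain dataclass and using placeholder values:

    # Placeholder values; field names come from the LightEvalLoggingArgs diff above.
    from lighteval.config.lighteval_config import LightEvalLoggingArgs

    logging_args = LightEvalLoggingArgs(
        output_dir="s3://my-eval-bucket/lighteval",  # fsspec-compliant, illustrative bucket
        save_details=True,
        push_to_hub=False,           # was push_results_to_hub / push_details_to_hub
        push_to_tensorboard=False,   # was push_results_to_tensorboard
        public_run=False,
        results_org=None,
    )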

src/lighteval/logging/evaluation_tracker.py

Lines changed: 97 additions & 116 deletions
Large diffs are not rendered by default.
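The tracker diff itself is not rendered here, but the new tests at the bottom of this commit pin down the layout it is expected to produce (timestamps below are illustrative):

    # Expected layout under output_dir, reconstructed from the tests below:
    #   results/<model_name>/results_<timestamp>.json                         # aggregated metrics + config_general
    #   details/<model_name>/<timestamp>/details_<task>_<timestamp>.parquet   # per-sample details, one dataset per task
    # With push_to_hub=True, the same artifacts are pushed to a private dataset repo
    # named "<hub_results_org>/details_<model_name>_private".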

src/lighteval/main_accelerate.py

Lines changed: 4 additions & 5 deletions
@@ -48,12 +48,11 @@ def main(args):
     env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
     evaluation_tracker = EvaluationTracker(
         output_dir=args.output_dir,
-        hub_results_org=args.results_org,
-        push_results_to_hub=args.push_results_to_hub,
-        push_details_to_hub=args.push_details_to_hub,
-        push_results_to_tensorboard=args.push_results_to_tensorboard,
+        save_details=args.save_details,
+        push_to_hub=args.push_to_hub,
+        push_to_tensorboard=args.push_to_tensorboard,
         public=args.public_run,
-        token=TOKEN,
+        hub_results_org=args.results_org,
     )
     pipeline_params = PipelineParameters(
         launcher_type=ParallelismManager.ACCELERATE,

src/lighteval/main_nanotron.py

Lines changed: 4 additions & 0 deletions
@@ -69,6 +69,10 @@ def main(
     evaluation_tracker = EvaluationTracker(
         output_dir=lighteval_config.logging.output_dir,
         hub_results_org=lighteval_config.logging.results_org,
+        public=lighteval_config.logging.public_run,
+        push_to_hub=lighteval_config.logging.push_to_hub,
+        push_to_tensorboard=lighteval_config.logging.push_to_tensorboard,
+        save_details=lighteval_config.logging.save_details,
         tensorboard_metric_prefix=lighteval_config.logging.tensorboard_metric_prefix,
         nanotron_run_info=nanotron_config.nanotron_config.general,
     )

src/lighteval/parsers.py

Lines changed: 6 additions & 6 deletions
@@ -54,15 +54,15 @@ def parser_accelerate(parser=None):
     parser.add_argument("--job_id", type=str, help="Optional Job ID for future reference", default="")

     # Saving
-    parser.add_argument("--output_dir", required=True, type=str, help="Directory to save the results")
     parser.add_argument(
-        "--push_results_to_hub", default=False, action="store_true", help="Set to push the results to the hub"
+        "--output_dir",
+        required=True,
+        type=str,
+        help="Directory to save the results, fsspec compliant (e.g. s3://bucket/path)",
     )
     parser.add_argument("--save_details", action="store_true", help="Save the details of the run in the output_dir")
-    parser.add_argument(
-        "--push_details_to_hub", default=False, action="store_true", help="Set to push the details to the hub"
-    )
-    parser.add_argument("--push_results_to_tensorboard", default=False, action="store_true")
+    parser.add_argument("--push_to_hub", default=False, action="store_true", help="Set to push the details to the hub")
+    parser.add_argument("--push_to_tensorboard", default=False, action="store_true")
     parser.add_argument(
         "--public_run", default=False, action="store_true", help="Push results and details to a public repo"
     )
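A quick way to sanity-check the renamed CLI surface is to replay the flags through a throwaway parser; this is a standalone sketch that mimics the arguments above rather than calling lighteval's own parser object:

    # Standalone argparse sketch replicating the renamed flags above.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir", required=True, type=str)
    parser.add_argument("--save_details", action="store_true")
    parser.add_argument("--push_to_hub", default=False, action="store_true")
    parser.add_argument("--push_to_tensorboard", default=False, action="store_true")
    parser.add_argument("--public_run", default=False, action="store_true")

    args = parser.parse_args(["--output_dir", "s3://my-eval-bucket/lighteval", "--save_details", "--push_to_hub"])
    assert args.push_to_hub and args.save_details and not args.push_to_tensorboard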

tests/fixtures.py

Lines changed: 60 additions & 0 deletions
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import os

import pytest
from huggingface_hub import HfApi
from huggingface_hub.hf_api import DatasetInfo


TESTING_EMPTY_HF_ORG_ID = "lighteval-tests"


@pytest.fixture
def testing_empty_hf_org_id(org_id: str = TESTING_EMPTY_HF_ORG_ID):
    old_token = os.getenv("HF_TOKEN")
    os.environ["HF_TOKEN"] = os.getenv("HF_TEST_TOKEN")

    def list_repos(org_id: str):
        return list(hf_api.list_models(author=org_id)) + list(hf_api.list_datasets(author=org_id))

    def clean_repos(org_id: str):
        repos = list_repos(org_id)
        for repo in repos:
            hf_api.delete_repo(repo.id, repo_type="dataset" if isinstance(repo, DatasetInfo) else "model")

    hf_api = HfApi()
    # Remove all repositories in the HF org
    clean_repos(org_id)

    # Verify that all repositories have been removed
    remaining_repos = list_repos(org_id)
    assert len(remaining_repos) == 0, f"Expected 0 repositories, but found {len(remaining_repos)}"

    yield org_id

    # Clean up: recreate any necessary default repositories after the test
    # This step is optional and depends on your specific needs
    clean_repos(org_id)
    os.environ["HF_TOKEN"] = old_token if old_token is not None else ""

Lines changed: 165 additions & 0 deletions
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import json
import os
import tempfile
from datetime import datetime
from pathlib import Path

import pytest
from datasets import Dataset
from huggingface_hub import HfApi

from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.logging.info_loggers import DetailsLogger

# ruff: noqa
from tests.fixtures import TESTING_EMPTY_HF_ORG_ID, testing_empty_hf_org_id


@pytest.fixture
def mock_evaluation_tracker(request):
    passed_params = {}
    if request.keywords.get("evaluation_tracker"):
        passed_params = request.keywords["evaluation_tracker"].kwargs

    with tempfile.TemporaryDirectory() as temp_dir:
        kwargs = {
            "output_dir": temp_dir,
            "save_details": passed_params.get("save_details", False),
            "push_to_hub": passed_params.get("push_to_hub", False),
            "push_to_tensorboard": passed_params.get("push_to_tensorboard", False),
            "hub_results_org": passed_params.get("hub_results_org", ""),
        }
        tracker = EvaluationTracker(**kwargs)
        tracker.general_config_logger.model_name = "test_model"
        yield tracker


@pytest.fixture
def mock_datetime(monkeypatch):
    mock_date = datetime(2023, 1, 1, 12, 0, 0)

    class MockDatetime:
        @classmethod
        def now(cls):
            return mock_date

        @classmethod
        def fromisoformat(cls, date_string: str):
            return mock_date

    monkeypatch.setattr("lighteval.logging.evaluation_tracker.datetime", MockDatetime)
    return mock_date


def test_results_logging(mock_evaluation_tracker: EvaluationTracker):
    task_metrics = {
        "task1": {"accuracy": 0.8, "f1": 0.75},
        "task2": {"precision": 0.9, "recall": 0.85},
    }
    mock_evaluation_tracker.metrics_logger.metric_aggregated = task_metrics

    mock_evaluation_tracker.save()

    results_dir = Path(mock_evaluation_tracker.output_dir) / "results" / "test_model"
    assert results_dir.exists()

    result_files = list(results_dir.glob("results_*.json"))
    assert len(result_files) == 1

    with open(result_files[0], "r") as f:
        saved_results = json.load(f)

    assert "results" in saved_results
    assert saved_results["results"] == task_metrics
    assert saved_results["config_general"]["model_name"] == "test_model"


@pytest.mark.evaluation_tracker(save_details=True)
def test_details_logging(mock_evaluation_tracker, mock_datetime):
    task_details = {
        "task1": [DetailsLogger.CompiledDetail(truncated=10, padded=5)],
        "task2": [DetailsLogger.CompiledDetail(truncated=20, padded=10)],
    }
    mock_evaluation_tracker.details_logger.details = task_details

    mock_evaluation_tracker.save()

    date_id = mock_datetime.isoformat().replace(":", "-")
    details_dir = Path(mock_evaluation_tracker.output_dir) / "details" / "test_model" / date_id
    assert details_dir.exists()

    for task in ["task1", "task2"]:
        file_path = details_dir / f"details_{task}_{date_id}.parquet"
        dataset = Dataset.from_parquet(str(file_path))
        assert len(dataset) == 1
        assert int(dataset[0]["truncated"]) == task_details[task][0].truncated
        assert int(dataset[0]["padded"]) == task_details[task][0].padded


@pytest.mark.evaluation_tracker(save_details=False)
def test_no_details_output(mock_evaluation_tracker: EvaluationTracker):
    mock_evaluation_tracker.save()

    details_dir = Path(mock_evaluation_tracker.output_dir) / "details" / "test_model"
    assert not details_dir.exists()


@pytest.mark.evaluation_tracker(push_to_hub=True, hub_results_org=TESTING_EMPTY_HF_ORG_ID)
def test_push_to_hub_works(testing_empty_hf_org_id, mock_evaluation_tracker: EvaluationTracker, mock_datetime):
    # Prepare the dummy data
    task_metrics = {
        "task1": {"accuracy": 0.8, "f1": 0.75},
        "task2": {"precision": 0.9, "recall": 0.85},
    }
    mock_evaluation_tracker.metrics_logger.metric_aggregated = task_metrics

    task_details = {
        "task1": [DetailsLogger.CompiledDetail(truncated=10, padded=5)],
        "task2": [DetailsLogger.CompiledDetail(truncated=20, padded=10)],
    }
    mock_evaluation_tracker.details_logger.details = task_details
    mock_evaluation_tracker.save()

    # Verify using HfApi
    api = HfApi()

    # Check if repo exists and it's private
    expected_repo_id = f"{testing_empty_hf_org_id}/details_test_model_private"
    assert api.repo_exists(repo_id=expected_repo_id, repo_type="dataset")
    assert api.repo_info(repo_id=expected_repo_id, repo_type="dataset").private

    repo_files = api.list_repo_files(repo_id=expected_repo_id, repo_type="dataset")
    # Check if README.md exists
    assert any(file == "README.md" for file in repo_files)

    # Check that both results files were uploaded
    result_files = [file for file in repo_files if file.startswith("results_")]
    assert len(result_files) == 2
    assert len([file for file in result_files if file.endswith(".json")]) == 1
    assert len([file for file in result_files if file.endswith(".parquet")]) == 1

    # Check that the details dataset was uploaded
    details_files = [file for file in repo_files if "details_" in file and file.endswith(".parquet")]
    assert len(details_files) == 2
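These tests rely on a custom evaluation_tracker marker to pass kwargs into the mock_evaluation_tracker fixture. Whether that marker is registered elsewhere in the repository is not visible in this commit; if it is not, a conftest.py hook along these lines would avoid pytest's unknown-marker warning (hypothetical addition, not part of the PR):

    # Hypothetical conftest.py snippet; only needed if the marker is not registered elsewhere.
    def pytest_configure(config):
        config.addinivalue_line(
            "markers",
            "evaluation_tracker(**kwargs): forward keyword arguments to the mock_evaluation_tracker fixture",
        )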

0 commit comments
