Commit a218c4d
hynky1999 and NathanHB authored
Logging Revamp (#284)
What does this implement/fix? Explain your changes.
---------------------------------------------------
This PR revamps the output saving system.

- Adds fsspec support for the output directory.
- Fixes a bug where the save_details parameter was never used or passed (it is now used).
- Since one can now push results/details directly to the Hub, the separate push_x_to_hub options are collapsed into a single push_to_hub, which behaves the way push_details_to_hub did before.
- Fixes the task_name extraction so that it doesn't explode once we move to year 2025 🫠
- Adds tests checking that the evaluation results are saved correctly. I omitted tests for tensorboard logging as I haven't changed anything there.

Comments
----------
- The tests now require HF_TOKEN, which can write/read in `lighteval-tests`. I have created the org and can give ownership to either @clefourrier or @NathanHB.
- Having secrets accessible during tests is a big security risk, especially when the tests can run without any interaction on PRs, but if the token only has permission to the lighteval-tests org I think it's fine.
- We should probably first merge a PR which gives lighteval ownership over the lighteval config. Right now I can adjust the lighteval config for the nanotron path to reflect the new API. You can review this PR in the meantime, but I added a TODO so that we don't forget. PS: That PR doesn't exist yet.

---------

Co-authored-by: Nathan Habib <[email protected]>
Co-authored-by: Nathan Habib <[email protected]>
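The diffs below boil down to one constructor: here is a minimal sketch of the revamped call surface, assembled from the main_accelerate.py change further down. The bucket and org names are placeholders, not values taken from this PR.

    # Sketch only: kwargs mirror the EvaluationTracker call in src/lighteval/main_accelerate.py below.
    from lighteval.logging.evaluation_tracker import EvaluationTracker

    evaluation_tracker = EvaluationTracker(
        output_dir="s3://my-eval-bucket/lighteval",  # any fsspec-compliant path, or a plain local directory
        save_details=True,                           # now actually forwarded (previously never passed through)
        push_to_hub=True,                            # replaces push_results_to_hub / push_details_to_hub
        push_to_tensorboard=False,                   # replaces push_results_to_tensorboard
        public=False,                                # keep the pushed repos private
        hub_results_org="my-org",                    # org that receives the pushed dataset repos
    )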
1 parent a84053d commit a218c4d

File tree

9 files changed: +342 -130 lines changed


.github/workflows/tests.yaml

Lines changed: 2 additions & 0 deletions
@@ -34,6 +34,8 @@ jobs:
           path: "cache"
           key: test-cache-HF
       - name: Test
+        env:
+          HF_TEST_TOKEN: ${{ secrets.HF_TEST_TOKEN }}
         run: | # PYTHONPATH="${PYTHONPATH}:src" HF_DATASETS_CACHE="cache/datasets" HF_HOME="cache/models"
           python -m pytest --disable-pytest-warnings
       - name: Write cache
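The HF_TEST_TOKEN secret added above is consumed by the new tests/fixtures.py further down, which copies it into HF_TOKEN for the duration of a test. As a purely hypothetical addition (not part of this PR), a skip guard would let the suite degrade gracefully when the secret is absent, e.g. on forks:

    # Hypothetical guard, not in this PR: skip Hub-dependent tests when the secret is missing.
    import os

    import pytest

    requires_hf_test_token = pytest.mark.skipif(
        os.getenv("HF_TEST_TOKEN") is None,
        reason="HF_TEST_TOKEN secret is not available in this environment",
    )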

pyproject.toml

Lines changed: 2 additions & 0 deletions
@@ -74,6 +74,7 @@ dependencies = [
   "sentencepiece>=0.1.99",
   "protobuf==3.20.*", # pinned for sentencepiece compat
   "pycountry",
+  "fsspec>=2023.12.2",
 ]

 [project.optional-dependencies]
@@ -95,6 +96,7 @@ extended_tasks = [
   "langdetect", # ifeval
   "openai", # llm as a judge using openai models
 ]
+s3 = ["s3fs"]

 [project.urls]
 Homepage = "https://github.com/huggingface/lighteval"
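With fsspec as a core dependency and s3fs behind the new s3 extra (pip install lighteval[s3]), output_dir can point at object storage as well as local disk. A minimal sketch of how such a path resolves; the bucket name is illustrative, and actually writing to it needs the s3 extra plus valid AWS credentials:

    # Illustrative only: "my-eval-bucket" is a made-up bucket.
    import fsspec

    fs, root = fsspec.core.url_to_fs("s3://my-eval-bucket/lighteval")  # -> (S3FileSystem, "my-eval-bucket/lighteval")
    with fsspec.open("s3://my-eval-bucket/lighteval/ping.json", "w") as f:
        f.write("{}")  # the tracker presumably writes its results JSON through a similar call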

src/lighteval/config/lighteval_config.py

Lines changed: 2 additions & 3 deletions
@@ -58,9 +58,8 @@ class LightEvalLoggingArgs:

     output_dir: str
     save_details: bool = True
-    push_results_to_hub: bool = False
-    push_details_to_hub: bool = False
-    push_results_to_tensorboard: bool = False
+    push_to_hub: bool = False
+    push_to_tensorboard: bool = False
     public_run: bool = False
     results_org: str | None = None
     tensorboard_metric_prefix: str = "eval"
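For nanotron runs the same rename surfaces at the config level; a sketch of instantiating the logging args with the new fields, assuming the class is a plain dataclass and using placeholder values:

    # Placeholder values; field names come from the LightEvalLoggingArgs diff above.
    from lighteval.config.lighteval_config import LightEvalLoggingArgs

    logging_args = LightEvalLoggingArgs(
        output_dir="s3://my-eval-bucket/lighteval",  # fsspec-compliant, illustrative bucket
        save_details=True,
        push_to_hub=False,           # was push_results_to_hub / push_details_to_hub
        push_to_tensorboard=False,   # was push_results_to_tensorboard
        public_run=False,
        results_org=None,
    )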

src/lighteval/logging/evaluation_tracker.py

Lines changed: 97 additions & 116 deletions
Large diffs are not rendered by default.
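The tracker diff itself is not rendered here, but the new tests at the bottom of this commit pin down the layout it is expected to produce (timestamps below are illustrative):

    # Expected layout under output_dir, reconstructed from the tests below:
    #   results/<model_name>/results_<timestamp>.json                         # aggregated metrics + config_general
    #   details/<model_name>/<timestamp>/details_<task>_<timestamp>.parquet   # per-sample details, one dataset per task
    # With push_to_hub=True, the same artifacts are pushed to a private dataset repo
    # named "<hub_results_org>/details_<model_name>_private".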

src/lighteval/main_accelerate.py

Lines changed: 4 additions & 5 deletions
@@ -48,12 +48,11 @@ def main(args):
     env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
     evaluation_tracker = EvaluationTracker(
         output_dir=args.output_dir,
-        hub_results_org=args.results_org,
-        push_results_to_hub=args.push_results_to_hub,
-        push_details_to_hub=args.push_details_to_hub,
-        push_results_to_tensorboard=args.push_results_to_tensorboard,
+        save_details=args.save_details,
+        push_to_hub=args.push_to_hub,
+        push_to_tensorboard=args.push_to_tensorboard,
         public=args.public_run,
-        token=TOKEN,
+        hub_results_org=args.results_org,
     )
     pipeline_params = PipelineParameters(
         launcher_type=ParallelismManager.ACCELERATE,

src/lighteval/main_nanotron.py

Lines changed: 4 additions & 0 deletions
@@ -69,6 +69,10 @@ def main(
     evaluation_tracker = EvaluationTracker(
         output_dir=lighteval_config.logging.output_dir,
         hub_results_org=lighteval_config.logging.results_org,
+        public=lighteval_config.logging.public_run,
+        push_to_hub=lighteval_config.logging.push_to_hub,
+        push_to_tensorboard=lighteval_config.logging.push_to_tensorboard,
+        save_details=lighteval_config.logging.save_details,
         tensorboard_metric_prefix=lighteval_config.logging.tensorboard_metric_prefix,
         nanotron_run_info=nanotron_config.nanotron_config.general,
     )

src/lighteval/parsers.py

Lines changed: 6 additions & 6 deletions
@@ -54,15 +54,15 @@ def parser_accelerate(parser=None):
     parser.add_argument("--job_id", type=str, help="Optional Job ID for future reference", default="")

     # Saving
-    parser.add_argument("--output_dir", required=True, type=str, help="Directory to save the results")
     parser.add_argument(
-        "--push_results_to_hub", default=False, action="store_true", help="Set to push the results to the hub"
+        "--output_dir",
+        required=True,
+        type=str,
+        help="Directory to save the results, fsspec compliant (e.g. s3://bucket/path)",
     )
     parser.add_argument("--save_details", action="store_true", help="Save the details of the run in the output_dir")
-    parser.add_argument(
-        "--push_details_to_hub", default=False, action="store_true", help="Set to push the details to the hub"
-    )
-    parser.add_argument("--push_results_to_tensorboard", default=False, action="store_true")
+    parser.add_argument("--push_to_hub", default=False, action="store_true", help="Set to push the details to the hub")
+    parser.add_argument("--push_to_tensorboard", default=False, action="store_true")
     parser.add_argument(
         "--public_run", default=False, action="store_true", help="Push results and details to a public repo"
     )
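A quick way to sanity-check the renamed CLI surface is to replay the flags through a throwaway parser; this is a standalone sketch that mimics the arguments above rather than calling lighteval's own parser object:

    # Standalone argparse sketch replicating the renamed flags above.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir", required=True, type=str)
    parser.add_argument("--save_details", action="store_true")
    parser.add_argument("--push_to_hub", default=False, action="store_true")
    parser.add_argument("--push_to_tensorboard", default=False, action="store_true")
    parser.add_argument("--public_run", default=False, action="store_true")

    args = parser.parse_args(["--output_dir", "s3://my-eval-bucket/lighteval", "--save_details", "--push_to_hub"])
    assert args.push_to_hub and args.save_details and not args.push_to_tensorboard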

tests/fixtures.py

Lines changed: 60 additions & 0 deletions
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


import os

import pytest
from huggingface_hub import HfApi
from huggingface_hub.hf_api import DatasetInfo


TESTING_EMPTY_HF_ORG_ID = "lighteval-tests"


@pytest.fixture
def testing_empty_hf_org_id(org_id: str = TESTING_EMPTY_HF_ORG_ID):
    old_token = os.getenv("HF_TOKEN")
    os.environ["HF_TOKEN"] = os.getenv("HF_TEST_TOKEN")

    def list_repos(org_id: str):
        return list(hf_api.list_models(author=org_id)) + list(hf_api.list_datasets(author=org_id))

    def clean_repos(org_id: str):
        repos = list_repos(org_id)
        for repo in repos:
            hf_api.delete_repo(repo.id, repo_type="dataset" if isinstance(repo, DatasetInfo) else "model")

    hf_api = HfApi()
    # Remove all repositories in the HF org
    clean_repos(org_id)

    # Verify that all repositories have been removed
    remaining_repos = list_repos(org_id)
    assert len(remaining_repos) == 0, f"Expected 0 repositories, but found {len(remaining_repos)}"

    yield org_id

    # Clean up: recreate any necessary default repositories after the test
    # This step is optional and depends on your specific needs
    clean_repos(org_id)
    os.environ["HF_TOKEN"] = old_token if old_token is not None else ""

Lines changed: 165 additions & 0 deletions
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import json
import os
import tempfile
from datetime import datetime
from pathlib import Path

import pytest
from datasets import Dataset
from huggingface_hub import HfApi

from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.logging.info_loggers import DetailsLogger

# ruff: noqa
from tests.fixtures import TESTING_EMPTY_HF_ORG_ID, testing_empty_hf_org_id


@pytest.fixture
def mock_evaluation_tracker(request):
    passed_params = {}
    if request.keywords.get("evaluation_tracker"):
        passed_params = request.keywords["evaluation_tracker"].kwargs

    with tempfile.TemporaryDirectory() as temp_dir:
        kwargs = {
            "output_dir": temp_dir,
            "save_details": passed_params.get("save_details", False),
            "push_to_hub": passed_params.get("push_to_hub", False),
            "push_to_tensorboard": passed_params.get("push_to_tensorboard", False),
            "hub_results_org": passed_params.get("hub_results_org", ""),
        }
        tracker = EvaluationTracker(**kwargs)
        tracker.general_config_logger.model_name = "test_model"
        yield tracker


@pytest.fixture
def mock_datetime(monkeypatch):
    mock_date = datetime(2023, 1, 1, 12, 0, 0)

    class MockDatetime:
        @classmethod
        def now(cls):
            return mock_date

        @classmethod
        def fromisoformat(cls, date_string: str):
            return mock_date

    monkeypatch.setattr("lighteval.logging.evaluation_tracker.datetime", MockDatetime)
    return mock_date


def test_results_logging(mock_evaluation_tracker: EvaluationTracker):
    task_metrics = {
        "task1": {"accuracy": 0.8, "f1": 0.75},
        "task2": {"precision": 0.9, "recall": 0.85},
    }
    mock_evaluation_tracker.metrics_logger.metric_aggregated = task_metrics

    mock_evaluation_tracker.save()

    results_dir = Path(mock_evaluation_tracker.output_dir) / "results" / "test_model"
    assert results_dir.exists()

    result_files = list(results_dir.glob("results_*.json"))
    assert len(result_files) == 1

    with open(result_files[0], "r") as f:
        saved_results = json.load(f)

    assert "results" in saved_results
    assert saved_results["results"] == task_metrics
    assert saved_results["config_general"]["model_name"] == "test_model"


@pytest.mark.evaluation_tracker(save_details=True)
def test_details_logging(mock_evaluation_tracker, mock_datetime):
    task_details = {
        "task1": [DetailsLogger.CompiledDetail(truncated=10, padded=5)],
        "task2": [DetailsLogger.CompiledDetail(truncated=20, padded=10)],
    }
    mock_evaluation_tracker.details_logger.details = task_details

    mock_evaluation_tracker.save()

    date_id = mock_datetime.isoformat().replace(":", "-")
    details_dir = Path(mock_evaluation_tracker.output_dir) / "details" / "test_model" / date_id
    assert details_dir.exists()

    for task in ["task1", "task2"]:
        file_path = details_dir / f"details_{task}_{date_id}.parquet"
        dataset = Dataset.from_parquet(str(file_path))
        assert len(dataset) == 1
        assert int(dataset[0]["truncated"]) == task_details[task][0].truncated
        assert int(dataset[0]["padded"]) == task_details[task][0].padded


@pytest.mark.evaluation_tracker(save_details=False)
def test_no_details_output(mock_evaluation_tracker: EvaluationTracker):
    mock_evaluation_tracker.save()

    details_dir = Path(mock_evaluation_tracker.output_dir) / "details" / "test_model"
    assert not details_dir.exists()


@pytest.mark.evaluation_tracker(push_to_hub=True, hub_results_org=TESTING_EMPTY_HF_ORG_ID)
def test_push_to_hub_works(testing_empty_hf_org_id, mock_evaluation_tracker: EvaluationTracker, mock_datetime):
    # Prepare the dummy data
    task_metrics = {
        "task1": {"accuracy": 0.8, "f1": 0.75},
        "task2": {"precision": 0.9, "recall": 0.85},
    }
    mock_evaluation_tracker.metrics_logger.metric_aggregated = task_metrics

    task_details = {
        "task1": [DetailsLogger.CompiledDetail(truncated=10, padded=5)],
        "task2": [DetailsLogger.CompiledDetail(truncated=20, padded=10)],
    }
    mock_evaluation_tracker.details_logger.details = task_details
    mock_evaluation_tracker.save()

    # Verify using HfApi
    api = HfApi()

    # Check if repo exists and it's private
    expected_repo_id = f"{testing_empty_hf_org_id}/details_test_model_private"
    assert api.repo_exists(repo_id=expected_repo_id, repo_type="dataset")
    assert api.repo_info(repo_id=expected_repo_id, repo_type="dataset").private

    repo_files = api.list_repo_files(repo_id=expected_repo_id, repo_type="dataset")
    # Check if README.md exists
    assert any(file == "README.md" for file in repo_files)

    # Check that both results files were uploaded
    result_files = [file for file in repo_files if file.startswith("results_")]
    assert len(result_files) == 2
    assert len([file for file in result_files if file.endswith(".json")]) == 1
    assert len([file for file in result_files if file.endswith(".parquet")]) == 1

    # Check that the details dataset was uploaded
    details_files = [file for file in repo_files if "details_" in file and file.endswith(".parquet")]
    assert len(details_files) == 2
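These tests rely on a custom evaluation_tracker marker to pass kwargs into the mock_evaluation_tracker fixture. Whether that marker is registered elsewhere in the repository is not visible in this commit; if it is not, a conftest.py hook along these lines would avoid pytest's unknown-marker warning (hypothetical addition, not part of the PR):

    # Hypothetical conftest.py snippet; only needed if the marker is not registered elsewhere.
    def pytest_configure(config):
        config.addinivalue_line(
            "markers",
            "evaluation_tracker(**kwargs): forward keyword arguments to the mock_evaluation_tracker fixture",
        )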

0 commit comments
