diff --git a/Makefile b/Makefile index e40c191a73..fa7a67aca7 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ install: install-for-tests: @echo "--- ๐Ÿš€ Installing project dependencies for test ---" @echo "This ensures that the project is not installed in editable mode" - pip install ".[bm25s,pylate,image,codecarbon,faiss-cpu]" --group dev + pip install ".[bm25s,pylate,image,codecarbon,leaderboard,faiss-cpu]" --group dev lint: @echo "--- ๐Ÿงน Running linters ---" diff --git a/docs/usage/leaderboard.md b/docs/usage/leaderboard.md index 47ee1d8860..f55871cb14 100644 --- a/docs/usage/leaderboard.md +++ b/docs/usage/leaderboard.md @@ -7,7 +7,35 @@ This section contains information on how to interact with the leaderboard includ It is possible to completely deploy the leaderboard locally or self-host it. This can e.g. be relevant for companies that might want to integrate build their own benchmarks or integrate custom tasks into existing benchmarks. -Running the leaderboard is quite easy. 
Simply run: +The leaderboard can be run in two ways: + +#### Using the CLI Command + +The easiest way to run the leaderboard is using the MTEB CLI: + +```bash +mteb leaderboard +``` + +You can also specify a custom cache path for model results: + +```bash +mteb leaderboard --cache-path results +``` + +Additional options: +- `--host HOST`: Specify the host to run the server on (default: 0.0.0.0) +- `--port PORT`: Specify the port to run the server on (default: 7860) +- `--share`: Create a public URL for the leaderboard + +Example with all options: +```bash +mteb leaderboard --cache-path results --port 8080 --share +``` + +#### Using Make Command + +Alternatively, you can use the Makefile: ```bash make run-leaderboard ``` diff --git a/mteb/cli/build_cli.py b/mteb/cli/build_cli.py index c307320a4a..4aad361db7 100644 --- a/mteb/cli/build_cli.py +++ b/mteb/cli/build_cli.py @@ -361,6 +361,95 @@ def _add_create_meta_parser(subparsers) -> None: parser.set_defaults(func=_create_meta) +def _add_leaderboard_parser(subparsers) -> None: + parser = subparsers.add_parser("leaderboard", help="Launch the MTEB leaderboard") + + parser.add_argument( + "--cache-path", + type=str, + help="Path to the cache folder containing model results", + required=False, + default=None, + ) + parser.add_argument( + "--host", + type=str, + default="0.0.0.0", + help="Host to run the leaderboard server on", + ) + parser.add_argument( + "--port", + type=int, + default=7860, + help="Port to run the leaderboard server on", + ) + parser.add_argument( + "--share", + action="store_true", + default=False, + help="Create a public URL for the leaderboard", + ) + + parser.set_defaults(func=_leaderboard) + + +def _leaderboard(args: argparse.Namespace) -> None: + """Launch the MTEB leaderboard with specified cache path.""" + # Import leaderboard module only when needed to avoid requiring leaderboard dependencies + # for other CLI commands + try: + import gradio as gr + + from mteb.leaderboard import 
get_leaderboard_app + except ImportError as e: + raise ImportError( + "Seems like some dependencies are not installed. " + + "You can likely install these using: `pip install mteb[leaderboard]`. " + + f"{e}" + ) + + cache_path = args.cache_path + + if cache_path: + logger.info(f"Using cache path: {cache_path}") + cache = ResultCache(cache_path) + else: + cache = ResultCache() + logger.info(f"Using default cache path: {cache.cache_path}") + + app = get_leaderboard_app(cache) + + logger.info(f"Starting leaderboard on {args.host}:{args.port}") + if args.share: + logger.info("Creating public URL...") + + logging.getLogger("mteb.load_results.task_results").setLevel( + logging.ERROR + ) # Warnings related to task split + logging.getLogger("mteb.model_meta").setLevel( + logging.ERROR + ) # Warning related to model metadata (fetch_from_hf=False) + logging.getLogger("mteb.load_results.benchmark_results").setLevel( + logging.ERROR + ) # Warning related to model metadata (fetch_from_hf=False) + warnings.filterwarnings("ignore", message="Couldn't get scores for .* due to .*") + + # Head content for Tailwind CSS + head = """ + + """ + + app.launch( + server_name=args.host, + server_port=args.port, + share=args.share, + theme=gr.themes.Soft( + font=[gr.themes.GoogleFont("Roboto Mono"), "Arial", "sans-serif"], + ), + head=head, + ) + + def build_cli() -> argparse.ArgumentParser: """Builds the argument parser for the MTEB CLI. 
@@ -380,6 +469,7 @@ def build_cli() -> argparse.ArgumentParser: _add_available_tasks_parser(subparsers) _add_available_benchmarks_parser(subparsers) _add_create_meta_parser(subparsers) + _add_leaderboard_parser(subparsers) return parser diff --git a/tests/test_cli.py b/tests/test_cli.py index 5637af21bb..68d99efc91 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -4,6 +4,7 @@ import sys from argparse import Namespace from pathlib import Path +from unittest.mock import MagicMock, patch import pytest import yaml @@ -12,6 +13,7 @@ _available_benchmarks, _available_tasks, _create_meta, + _leaderboard, run, ) @@ -197,3 +199,107 @@ def test_create_meta_from_existing( command = f"{sys.executable} -m mteb create-model-results --model-name {model_name} --results-folder {output_folder.as_posix()} --output-path {output_path.as_posix()} --from-existing {existing_readme.as_posix()} --overwrite" result = subprocess.run(command, shell=True, capture_output=True, text=True) assert result.returncode == 0, "Command failed" + + +def test_leaderboard_help(): + """Test that leaderboard help command works.""" + command = [sys.executable, "-m", "mteb", "leaderboard", "--help"] + result = subprocess.run(command, capture_output=True, text=True) + + assert result.returncode == 0, "Leaderboard help command failed" + assert "--cache-path" in result.stdout, "--cache-path option not found in help" + assert "--host" in result.stdout, "--host option not found in help" + assert "--port" in result.stdout, "--port option not found in help" + assert "--share" in result.stdout, "--share option not found in help" + assert "Path to the cache folder containing model results" in result.stdout, ( + "Cache path description not found" + ) + + +@pytest.mark.parametrize( + "cache_path_input,host,port,share,test_description", + [ + ("custom", "localhost", 8080, True, "custom cache path"), + (None, "127.0.0.1", 7860, False, "default cache path"), + ], +) +def test_leaderboard_cache_paths( + tmp_path: 
Path, cache_path_input, host, port, share, test_description +): + """Test leaderboard with different cache path configurations.""" + + # Set up cache path based on parameter + if cache_path_input == "custom": + custom_cache = tmp_path / "my_results" + custom_cache.mkdir(exist_ok=True) + cache_path = str(custom_cache) + expected_cache_path = custom_cache + else: + cache_path = None + from mteb.cache import ResultCache + + expected_cache_path = ResultCache().default_cache_path + + # Mock the get_leaderboard_app function and the gradio app + mock_app = MagicMock() + mock_app.launch = MagicMock() + + # Create a mock function that captures the cache argument and returns our mock app + def mock_get_app_func(cache): + # Store the cache for verification + mock_get_app_func.called_with_cache = cache + return mock_app + + # Mock gradio themes + mock_theme = MagicMock() + mock_font = MagicMock() + + # Patch the local import inside _leaderboard function + with patch.dict( + "sys.modules", + { + "mteb.leaderboard": MagicMock(get_leaderboard_app=mock_get_app_func), + "gradio": MagicMock( + themes=MagicMock( + Soft=MagicMock(return_value=mock_theme), + GoogleFont=MagicMock(return_value=mock_font), + ) + ), + }, + ): + args = Namespace( + cache_path=cache_path, + host=host, + port=port, + share=share, + ) + + _leaderboard(args) + + # Verify get_leaderboard_app was called with a cache that has the correct path + assert hasattr(mock_get_app_func, "called_with_cache"), ( + "get_leaderboard_app was not called" + ) + cache_instance = mock_get_app_func.called_with_cache + assert cache_instance.cache_path == expected_cache_path, ( + f"Expected cache path {expected_cache_path}, got {cache_instance.cache_path}" + ) + + # Verify launch parameters + mock_app.launch.assert_called_once_with( + server_name=host, + server_port=port, + share=share, + theme=mock_theme, + head='\n \n ', + ) + + +def test_leaderboard_cli_integration(): + """Test the full CLI command integration.""" + # Test that the 
command is recognized by the CLI + command = [sys.executable, "-m", "mteb", "--help"] + result = subprocess.run(command, capture_output=True, text=True) + + assert result.returncode == 0, "Main help command failed" + assert "leaderboard" in result.stdout, "Leaderboard command not found in main help"