-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEvaluating language models sample
1 lines (1 loc) · 690 KB
/
Evaluating language models sample
1
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.14","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"gpu","dataSources":[],"dockerImageVersionId":30761,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# Overview\n\n","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19"}},{"cell_type":"markdown","source":"This is a sample note on how to evaluate a language model, copied directly from Aisuko's note \"Evaluating language models\".","metadata":{}},{"cell_type":"code","source":"# !pip install -U -q lm-eval==0.4.3","metadata":{"execution":{"iopub.status.busy":"2024-09-02T06:36:44.292650Z","iopub.execute_input":"2024-09-02T06:36:44.293065Z","iopub.status.idle":"2024-09-02T06:36:44.297187Z","shell.execute_reply.started":"2024-09-02T06:36:44.293027Z","shell.execute_reply":"2024-09-02T06:36:44.296206Z"},"trusted":true},"execution_count":2,"outputs":[]},{"cell_type":"code","source":"!git clone https://github.com/EleutherAI/lm-evaluation-harness\n%cd lm-evaluation-harness\n!pip install -e .","metadata":{"execution":{"iopub.status.busy":"2024-09-02T06:36:44.299054Z","iopub.execute_input":"2024-09-02T06:36:44.299665Z","iopub.status.idle":"2024-09-02T06:37:31.132285Z","shell.execute_reply.started":"2024-09-02T06:36:44.299632Z","shell.execute_reply":"2024-09-02T06:37:31.131320Z"},"trusted":true},"execution_count":3,"outputs":[{"name":"stdout","text":"Cloning into 'lm-evaluation-harness'...\nremote: Enumerating objects: 40187, done.\u001b[K\nremote: Counting objects: 100% (516/516), done.\u001b[K\nremote: Compressing objects: 100% (360/360), done.\u001b[K\nremote: Total 40187 
(delta 255), reused 386 (delta 155), pack-reused 39671 (from 1)\u001b[K\nReceiving objects: 100% (40187/40187), 26.95 MiB | 20.81 MiB/s, done.\nResolving deltas: 100% (28169/28169), done.\n/kaggle/working/lm-evaluation-harness\nObtaining file:///kaggle/working/lm-evaluation-harness\n Installing build dependencies ... \u001b[?25ldone\n\u001b[?25h Checking if build backend supports build_editable ... \u001b[?25ldone\n\u001b[?25h Getting requirements to build editable ... \u001b[?25ldone\n\u001b[?25h Preparing editable metadata (pyproject.toml) ... \u001b[?25ldone\n\u001b[?25hRequirement already satisfied: accelerate>=0.26.0 in /opt/conda/lib/python3.10/site-packages (from lm_eval==0.4.3) (0.33.0)\nCollecting evaluate (from lm_eval==0.4.3)\n Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)\nRequirement already satisfied: datasets>=2.16.0 in /opt/conda/lib/python3.10/site-packages (from lm_eval==0.4.3) (2.21.0)\nCollecting jsonlines (from lm_eval==0.4.3)\n Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)\nRequirement already satisfied: numexpr in /opt/conda/lib/python3.10/site-packages (from lm_eval==0.4.3) (2.10.1)\nCollecting peft>=0.2.0 (from lm_eval==0.4.3)\n Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)\nRequirement already satisfied: pybind11>=2.6.2 in /opt/conda/lib/python3.10/site-packages (from lm_eval==0.4.3) (2.13.4)\nCollecting pytablewriter (from lm_eval==0.4.3)\n Downloading pytablewriter-1.2.0-py3-none-any.whl.metadata (37 kB)\nCollecting rouge-score>=0.0.4 (from lm_eval==0.4.3)\n Downloading rouge_score-0.1.2.tar.gz (17 kB)\n Preparing metadata (setup.py) ... 
\u001b[?25ldone\n\u001b[?25hCollecting sacrebleu>=1.5.0 (from lm_eval==0.4.3)\n Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.8/51.8 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[?25hRequirement already satisfied: scikit-learn>=0.24.1 in /opt/conda/lib/python3.10/site-packages (from lm_eval==0.4.3) (1.2.2)\nCollecting sqlitedict (from lm_eval==0.4.3)\n Downloading sqlitedict-2.1.0.tar.gz (21 kB)\n Preparing metadata (setup.py) ... \u001b[?25ldone\n\u001b[?25hRequirement already satisfied: torch>=1.8 in /opt/conda/lib/python3.10/site-packages (from lm_eval==0.4.3) (2.4.0)\nCollecting tqdm-multiprocess (from lm_eval==0.4.3)\n Downloading tqdm_multiprocess-0.0.11-py3-none-any.whl.metadata (5.7 kB)\nRequirement already satisfied: transformers>=4.1 in /opt/conda/lib/python3.10/site-packages (from lm_eval==0.4.3) (4.44.0)\nRequirement already satisfied: zstandard in /opt/conda/lib/python3.10/site-packages (from lm_eval==0.4.3) (0.23.0)\nRequirement already satisfied: dill in /opt/conda/lib/python3.10/site-packages (from lm_eval==0.4.3) (0.3.8)\nCollecting word2number (from lm_eval==0.4.3)\n Downloading word2number-1.1.zip (9.7 kB)\n Preparing metadata (setup.py) ... 
\u001b[?25ldone\n\u001b[?25hRequirement already satisfied: more-itertools in /opt/conda/lib/python3.10/site-packages (from lm_eval==0.4.3) (10.3.0)\nRequirement already satisfied: numpy<2.0.0,>=1.17 in /opt/conda/lib/python3.10/site-packages (from accelerate>=0.26.0->lm_eval==0.4.3) (1.26.4)\nRequirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from accelerate>=0.26.0->lm_eval==0.4.3) (21.3)\nRequirement already satisfied: psutil in /opt/conda/lib/python3.10/site-packages (from accelerate>=0.26.0->lm_eval==0.4.3) (5.9.3)\nRequirement already satisfied: pyyaml in /opt/conda/lib/python3.10/site-packages (from accelerate>=0.26.0->lm_eval==0.4.3) (6.0.2)\nRequirement already satisfied: huggingface-hub>=0.21.0 in /opt/conda/lib/python3.10/site-packages (from accelerate>=0.26.0->lm_eval==0.4.3) (0.24.6)\nRequirement already satisfied: safetensors>=0.3.1 in /opt/conda/lib/python3.10/site-packages (from accelerate>=0.26.0->lm_eval==0.4.3) (0.4.4)\nRequirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from datasets>=2.16.0->lm_eval==0.4.3) (3.15.1)\nRequirement already satisfied: pyarrow>=15.0.0 in /opt/conda/lib/python3.10/site-packages (from datasets>=2.16.0->lm_eval==0.4.3) (16.1.0)\nRequirement already satisfied: pandas in /opt/conda/lib/python3.10/site-packages (from datasets>=2.16.0->lm_eval==0.4.3) (2.2.2)\nRequirement already satisfied: requests>=2.32.2 in /opt/conda/lib/python3.10/site-packages (from datasets>=2.16.0->lm_eval==0.4.3) (2.32.3)\nRequirement already satisfied: tqdm>=4.66.3 in /opt/conda/lib/python3.10/site-packages (from datasets>=2.16.0->lm_eval==0.4.3) (4.66.4)\nRequirement already satisfied: xxhash in /opt/conda/lib/python3.10/site-packages (from datasets>=2.16.0->lm_eval==0.4.3) (3.4.1)\nRequirement already satisfied: multiprocess in /opt/conda/lib/python3.10/site-packages (from datasets>=2.16.0->lm_eval==0.4.3) (0.70.16)\nRequirement already satisfied: fsspec<=2024.6.1,>=2023.1.0 
in /opt/conda/lib/python3.10/site-packages (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets>=2.16.0->lm_eval==0.4.3) (2024.6.1)\nRequirement already satisfied: aiohttp in /opt/conda/lib/python3.10/site-packages (from datasets>=2.16.0->lm_eval==0.4.3) (3.9.5)\nRequirement already satisfied: absl-py in /opt/conda/lib/python3.10/site-packages (from rouge-score>=0.0.4->lm_eval==0.4.3) (1.4.0)\nRequirement already satisfied: nltk in /opt/conda/lib/python3.10/site-packages (from rouge-score>=0.0.4->lm_eval==0.4.3) (3.2.4)\nRequirement already satisfied: six>=1.14.0 in /opt/conda/lib/python3.10/site-packages (from rouge-score>=0.0.4->lm_eval==0.4.3) (1.16.0)\nCollecting portalocker (from sacrebleu>=1.5.0->lm_eval==0.4.3)\n Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)\nRequirement already satisfied: regex in /opt/conda/lib/python3.10/site-packages (from sacrebleu>=1.5.0->lm_eval==0.4.3) (2024.5.15)\nRequirement already satisfied: tabulate>=0.8.9 in /opt/conda/lib/python3.10/site-packages (from sacrebleu>=1.5.0->lm_eval==0.4.3) (0.9.0)\nRequirement already satisfied: colorama in /opt/conda/lib/python3.10/site-packages (from sacrebleu>=1.5.0->lm_eval==0.4.3) (0.4.6)\nRequirement already satisfied: lxml in /opt/conda/lib/python3.10/site-packages (from sacrebleu>=1.5.0->lm_eval==0.4.3) (5.3.0)\nRequirement already satisfied: scipy>=1.3.2 in /opt/conda/lib/python3.10/site-packages (from scikit-learn>=0.24.1->lm_eval==0.4.3) (1.14.0)\nRequirement already satisfied: joblib>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from scikit-learn>=0.24.1->lm_eval==0.4.3) (1.4.2)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.10/site-packages (from scikit-learn>=0.24.1->lm_eval==0.4.3) (3.5.0)\nRequirement already satisfied: typing-extensions>=4.8.0 in /opt/conda/lib/python3.10/site-packages (from torch>=1.8->lm_eval==0.4.3) (4.12.2)\nRequirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from 
torch>=1.8->lm_eval==0.4.3) (1.13.2)\nRequirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch>=1.8->lm_eval==0.4.3) (3.3)\nRequirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from torch>=1.8->lm_eval==0.4.3) (3.1.4)\nRequirement already satisfied: tokenizers<0.20,>=0.19 in /opt/conda/lib/python3.10/site-packages (from transformers>=4.1->lm_eval==0.4.3) (0.19.1)\nRequirement already satisfied: attrs>=19.2.0 in /opt/conda/lib/python3.10/site-packages (from jsonlines->lm_eval==0.4.3) (23.2.0)\nRequirement already satisfied: setuptools>=38.3.0 in /opt/conda/lib/python3.10/site-packages (from pytablewriter->lm_eval==0.4.3) (70.0.0)\nCollecting DataProperty<2,>=1.0.1 (from pytablewriter->lm_eval==0.4.3)\n Downloading DataProperty-1.0.1-py3-none-any.whl.metadata (11 kB)\nCollecting mbstrdecoder<2,>=1.0.0 (from pytablewriter->lm_eval==0.4.3)\n Downloading mbstrdecoder-1.1.3-py3-none-any.whl.metadata (4.0 kB)\nCollecting pathvalidate<4,>=2.3.0 (from pytablewriter->lm_eval==0.4.3)\n Downloading pathvalidate-3.2.1-py3-none-any.whl.metadata (12 kB)\nCollecting tabledata<2,>=1.3.1 (from pytablewriter->lm_eval==0.4.3)\n Downloading tabledata-1.3.3-py3-none-any.whl.metadata (3.7 kB)\nCollecting tcolorpy<1,>=0.0.5 (from pytablewriter->lm_eval==0.4.3)\n Downloading tcolorpy-0.1.6-py3-none-any.whl.metadata (6.4 kB)\nCollecting typepy<2,>=1.3.2 (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm_eval==0.4.3)\n Downloading typepy-1.3.2-py3-none-any.whl.metadata (9.3 kB)\nRequirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets>=2.16.0->lm_eval==0.4.3) (1.3.1)\nRequirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets>=2.16.0->lm_eval==0.4.3) (1.4.1)\nRequirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets>=2.16.0->lm_eval==0.4.3) 
(6.0.5)\nRequirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets>=2.16.0->lm_eval==0.4.3) (1.9.4)\nRequirement already satisfied: async-timeout<5.0,>=4.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets>=2.16.0->lm_eval==0.4.3) (4.0.3)\nCollecting chardet<6,>=3.0.4 (from mbstrdecoder<2,>=1.0.0->pytablewriter->lm_eval==0.4.3)\n Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)\nRequirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.10/site-packages (from packaging>=20.0->accelerate>=0.26.0->lm_eval==0.4.3) (3.1.2)\nRequirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests>=2.32.2->datasets>=2.16.0->lm_eval==0.4.3) (3.3.2)\nRequirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests>=2.32.2->datasets>=2.16.0->lm_eval==0.4.3) (3.7)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests>=2.32.2->datasets>=2.16.0->lm_eval==0.4.3) (1.26.18)\nRequirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests>=2.32.2->datasets>=2.16.0->lm_eval==0.4.3) (2024.7.4)\nRequirement already satisfied: python-dateutil<3.0.0,>=2.8.0 in /opt/conda/lib/python3.10/site-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm_eval==0.4.3) (2.9.0.post0)\nRequirement already satisfied: pytz>=2018.9 in /opt/conda/lib/python3.10/site-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm_eval==0.4.3) (2024.1)\nRequirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->torch>=1.8->lm_eval==0.4.3) (2.1.5)\nRequirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets>=2.16.0->lm_eval==0.4.3) (2024.1)\nRequirement already satisfied: mpmath<1.4,>=1.1.0 in 
/opt/conda/lib/python3.10/site-packages (from sympy->torch>=1.8->lm_eval==0.4.3) (1.3.0)\nDownloading evaluate-0.4.2-py3-none-any.whl (84 kB)\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.1/84.1 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[?25hDownloading peft-0.12.0-py3-none-any.whl (296 kB)\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m296.4/296.4 kB\u001b[0m \u001b[31m11.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[?25hDownloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m104.0/104.0 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[?25hDownloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)\nDownloading pytablewriter-1.2.0-py3-none-any.whl (111 kB)\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m111.1/111.1 kB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[?25hDownloading tqdm_multiprocess-0.0.11-py3-none-any.whl (9.8 kB)\nDownloading DataProperty-1.0.1-py3-none-any.whl (27 kB)\nDownloading mbstrdecoder-1.1.3-py3-none-any.whl (7.8 kB)\nDownloading pathvalidate-3.2.1-py3-none-any.whl (23 kB)\nDownloading tabledata-1.3.3-py3-none-any.whl (11 kB)\nDownloading tcolorpy-0.1.6-py3-none-any.whl (8.1 kB)\nDownloading typepy-1.3.2-py3-none-any.whl (31 kB)\nDownloading portalocker-2.10.1-py3-none-any.whl (18 kB)\nDownloading chardet-5.2.0-py3-none-any.whl (199 kB)\n\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.4/199.4 kB\u001b[0m \u001b[31m10.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n\u001b[?25hBuilding wheels for collected packages: lm_eval, rouge-score, sqlitedict, word2number\n Building editable for lm_eval (pyproject.toml) ... 
\u001b[?25ldone\n\u001b[?25h Created wheel for lm_eval: filename=lm_eval-0.4.3-0.editable-py3-none-any.whl size=18607 sha256=f3c4e364c01a1afb0dbd6d5b18d37f5247424da4a4571b67e4a18fc64ca4fee8\n Stored in directory: /tmp/pip-ephem-wheel-cache-ymq6qqnv/wheels/1b/1a/1b/44c80ddb18c9d7d3ce79a8d6d4561bddaddcbffb4cdfbf3259\n Building wheel for rouge-score (setup.py) ... \u001b[?25ldone\n\u001b[?25h Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=4fe4a0eb4bbada4afecf28b76a1db53487e8597a2b18f4851b11d24f7e6c2cfb\n Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4\n Building wheel for sqlitedict (setup.py) ... \u001b[?25ldone\n\u001b[?25h Created wheel for sqlitedict: filename=sqlitedict-2.1.0-py3-none-any.whl size=16862 sha256=fb839d1145e12765ec0769710bcc70798cc9f47da455a031fbf21dfd67b84246\n Stored in directory: /root/.cache/pip/wheels/79/d6/e7/304e0e6cb2221022c26d8161f7c23cd4f259a9e41e8bbcfabd\n Building wheel for word2number (setup.py) ... 
\u001b[?25ldone\n\u001b[?25h Created wheel for word2number: filename=word2number-1.1-py3-none-any.whl size=5566 sha256=f2768763bebd953c72253c9b4a9e28b1f6b354c00a23123ad8c8e15b1e50cfa2\n Stored in directory: /root/.cache/pip/wheels/84/ff/26/d3cfbd971e96c5aa3737ecfced81628830d7359b55fbb8ca3b\nSuccessfully built lm_eval rouge-score sqlitedict word2number\nInstalling collected packages: word2number, sqlitedict, tqdm-multiprocess, tcolorpy, portalocker, pathvalidate, jsonlines, chardet, sacrebleu, rouge-score, mbstrdecoder, typepy, peft, evaluate, DataProperty, tabledata, pytablewriter, lm_eval\nSuccessfully installed DataProperty-1.0.1 chardet-5.2.0 evaluate-0.4.2 jsonlines-4.0.0 lm_eval-0.4.3 mbstrdecoder-1.1.3 pathvalidate-3.2.1 peft-0.12.0 portalocker-2.10.1 pytablewriter-1.2.0 rouge-score-0.1.2 sacrebleu-2.4.3 sqlitedict-2.1.0 tabledata-1.3.3 tcolorpy-0.1.6 tqdm-multiprocess-0.0.11 typepy-1.3.2 word2number-1.1\n","output_type":"stream"}]},{"cell_type":"code","source":"import os\nfrom huggingface_hub import login\nfrom kaggle_secrets import UserSecretsClient\nuser_secrets = UserSecretsClient()\n\nos.environ[\"HF_TOKEN\"]=user_secrets.get_secret(\"HUGGINGFACE_TOKEN\")\n\nos.environ[\"WANDB_API_KEY\"]=user_secrets.get_secret(\"WANDB_API_KEY\")\nos.environ[\"WANDB_PROJECT\"] = \"Evaluating HuggingFace SmolLM-135M-Instruct\"\nos.environ[\"WANDB_NAME\"] = \"eva-smollm-135M-instruct\"\nos.environ[\"MODEL_NAME\"] = \"HuggingFaceTB/SmolLM-135M-Instruct\"\nos.environ[\"TOKENIZER_NAME\"] = \"HuggingFaceTB/SmolLM-135M-Instruct\"\nos.environ[\"DATASET\"] = 
\"HuggingFaceH4/ultrafeedback_binarized\"\n\nlogin(os.environ[\"HF_TOKEN\"])","metadata":{"execution":{"iopub.status.busy":"2024-09-02T06:37:31.133845Z","iopub.execute_input":"2024-09-02T06:37:31.134259Z","iopub.status.idle":"2024-09-02T06:37:32.103911Z","shell.execute_reply.started":"2024-09-02T06:37:31.134207Z","shell.execute_reply":"2024-09-02T06:37:32.102964Z"},"trusted":true},"execution_count":4,"outputs":[{"name":"stdout","text":"The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.\nToken is valid (permission: read).\nYour token has been saved to /root/.cache/huggingface/token\nLogin successful\n","output_type":"stream"}]},{"cell_type":"code","source":"!lm_eval --tasks list","metadata":{"_kg_hide-input":true,"execution":{"iopub.status.busy":"2024-09-02T06:37:32.105187Z","iopub.execute_input":"2024-09-02T06:37:32.105512Z","iopub.status.idle":"2024-09-02T06:38:25.133438Z","shell.execute_reply.started":"2024-09-02T06:37:32.105476Z","shell.execute_reply":"2024-09-02T06:38:25.132252Z"},"trusted":true},"execution_count":5,"outputs":[{"name":"stdout","text":"\n| Group | Config Location |\n|---------------------------------|------------------------------------------------------------------------|\n|aclue |lm_eval/tasks/aclue/_aclue.yaml |\n|aexams |lm_eval/tasks/aexams/_aexams.yaml |\n|agieval |lm_eval/tasks/agieval/agieval.yaml |\n|agieval_cn |lm_eval/tasks/agieval/agieval_cn.yaml |\n|agieval_en |lm_eval/tasks/agieval/agieval_en.yaml |\n|agieval_nous |lm_eval/tasks/agieval/agieval_nous.yaml |\n|arabicmmlu |lm_eval/tasks/arabicmmlu/_arabicmmlu.yaml |\n|arabicmmlu_humanities |lm_eval/tasks/arabicmmlu/_arabicmmlu_humanities.yaml |\n|arabicmmlu_language |lm_eval/tasks/arabicmmlu/_arabicmmlu_language.yaml |\n|arabicmmlu_other |lm_eval/tasks/arabicmmlu/_arabicmmlu_other.yaml 
|\n|arabicmmlu_social_science |lm_eval/tasks/arabicmmlu/_arabicmmlu_social_science.yaml |\n|arabicmmlu_stem |lm_eval/tasks/arabicmmlu/_arabicmmlu_stem.yaml |\n|bbh |lm_eval/tasks/bbh/cot_fewshot/_bbh.yaml |\n|bbh_cot_fewshot |lm_eval/tasks/bbh/cot_fewshot/_bbh_cot_fewshot.yaml |\n|bbh_cot_zeroshot |lm_eval/tasks/bbh/cot_zeroshot/_bbh_cot_zeroshot.yaml |\n|bbh_fewshot |lm_eval/tasks/bbh/fewshot/_bbh_fewshot.yaml |\n|bbh_zeroshot |lm_eval/tasks/bbh/zeroshot/_bbh_zeroshot.yaml |\n|belebele |lm_eval/tasks/belebele/_belebele.yaml |\n|blimp |lm_eval/tasks/blimp/_blimp.yaml |\n|ceval-valid |lm_eval/tasks/ceval/_ceval-valid.yaml |\n|cmmlu |lm_eval/tasks/cmmlu/_cmmlu.yaml |\n|csatqa |lm_eval/tasks/csatqa/_csatqa.yaml |\n|flan_held_in |lm_eval/tasks/benchmarks/flan/flan_held_in.yaml |\n|flan_held_out |lm_eval/tasks/benchmarks/flan/flan_held_out.yaml |\n|haerae |lm_eval/tasks/haerae/_haerae.yaml |\n|hendrycks_math |lm_eval/tasks/hendrycks_math/hendrycks_math.yaml |\n|kormedmcqa |lm_eval/tasks/kormedmcqa/_kormedmcqa.yaml |\n|leaderboard |lm_eval/tasks/leaderboard/leaderboard.yaml |\n|leaderboard_bbh |lm_eval/tasks/leaderboard/bbh_mc/_leaderboard_bbh.yaml |\n|leaderboard_gpqa |lm_eval/tasks/leaderboard/gpqa/_leaderboard_gpqa.yaml |\n|leaderboard_instruction_following|lm_eval/tasks/leaderboard/ifeval/_leaderboard_instruction_following.yaml|\n|leaderboard_math_hard |lm_eval/tasks/leaderboard/math/_leaderboard_math.yaml |\n|leaderboard_musr |lm_eval/tasks/leaderboard/musr/_musr.yaml |\n|lingoly |lm_eval/tasks/lingoly/lingoly_group.yaml |\n|med_concepts_qa |lm_eval/tasks/med_concepts_qa/_med_concepts_qa.yaml |\n|med_concepts_qa_atc |lm_eval/tasks/med_concepts_qa/_med_concepts_qa_atc.yaml |\n|med_concepts_qa_icd10cm |lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd10cm.yaml |\n|med_concepts_qa_icd10proc |lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd10proc.yaml |\n|med_concepts_qa_icd9cm |lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd9cm.yaml |\n|med_concepts_qa_icd9proc 
|lm_eval/tasks/med_concepts_qa/_med_concepts_qa_icd9proc.yaml |\n|mela |lm_eval/tasks/mela/_mela.yaml |\n|minerva_math |lm_eval/tasks/benchmarks/minerva_math.yaml |\n|mmlu |lm_eval/tasks/mmlu/default/_mmlu.yaml |\n|mmlu_continuation |lm_eval/tasks/mmlu/continuation/_mmlu.yaml |\n|mmlu_flan_cot_fewshot |lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml |\n|mmlu_flan_cot_zeroshot |lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu.yaml |\n|mmlu_flan_n_shot_generative |lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu.yaml |\n|mmlu_flan_n_shot_loglikelihood |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu.yaml |\n|mmlu_generative |lm_eval/tasks/mmlu/generative/_mmlu.yaml |\n|mmlu_humanities |lm_eval/tasks/mmlu/default/_mmlu_humanities.yaml |\n|mmlu_other |lm_eval/tasks/mmlu/default/_mmlu_other.yaml |\n|mmlu_pro |lm_eval/tasks/mmlu_pro/_mmlu_pro.yaml |\n|mmlu_social_sciences |lm_eval/tasks/mmlu/default/_mmlu_social_sciences.yaml |\n|mmlu_stem |lm_eval/tasks/mmlu/default/_mmlu_stem.yaml |\n|mmlusr |lm_eval/tasks/mmlusr/question_and_answer/_question_and_answer.yaml |\n|mmlusr_answer_only |lm_eval/tasks/mmlusr/answer_only/_answer_only.yaml |\n|mmlusr_question_only |lm_eval/tasks/mmlusr/question_only/_question_only.yaml |\n|multimedqa |lm_eval/tasks/benchmarks/multimedqa/multimedqa.yaml |\n|openllm |lm_eval/tasks/benchmarks/openllm.yaml |\n|pawsx |lm_eval/tasks/paws-x/_pawsx.yaml |\n|pythia |lm_eval/tasks/benchmarks/pythia.yaml |\n|t0_eval |lm_eval/tasks/benchmarks/t0_eval.yaml |\n|tinyBenchmarks |lm_eval/tasks/tinyBenchmarks/tinyBenchmarks.yaml |\n|tmlu |lm_eval/tasks/tmlu/default/_tmlu.yaml |\n|tmmluplus |lm_eval/tasks/tmmluplus/default/tmmluplus.yaml |\n|wmdp |lm_eval/tasks/wmdp/_wmdp.yaml |\n|xcopa |lm_eval/tasks/xcopa/_xcopa.yaml |\n|xnli |lm_eval/tasks/xnli/_xnli.yaml |\n|xstorycloze |lm_eval/tasks/xstorycloze/_xstorycloze.yaml |\n|xwinograd |lm_eval/tasks/xwinograd/_xwinograd.yaml |\n\n\n| Tag |\n|------------------------------------------------|\n|advanced_ai_risk 
|\n|afrimgsm |\n|afrimgsm_direct |\n|afrimgsm_en_cot |\n|afrimgsm_translate |\n|afrimmlu |\n|afrimmlu_direct |\n|afrimmlu_translate |\n|afrixnli |\n|afrixnli_en_direct |\n|afrixnli_manual_direct |\n|afrixnli_native_direct |\n|afrixnli_translate |\n|ai2_arc |\n|anli |\n|arabicmmlu_humanities_tasks |\n|arabicmmlu_language_tasks |\n|arabicmmlu_other_tasks |\n|arabicmmlu_social_science_tasks |\n|arabicmmlu_stem_tasks |\n|arc_challenge_mt |\n|arc_multilingual |\n|arithmetic |\n|basque-glue |\n|bertaqa |\n|bigbench_generate_until |\n|bigbench_multiple_choice |\n|chain_of_thought |\n|codexglue_code2text |\n|copal_id |\n|crows_pairs |\n|eus_exams_es |\n|eus_exams_eu |\n|fld_logical_formula |\n|freebase |\n|french_bench |\n|french_bench_extra |\n|french_bench_gen |\n|french_bench_mc |\n|french_bench_perplexity |\n|glue |\n|gpqa |\n|gpt3_translation_benchmarks |\n|headqa |\n|hellaswag_multilingual |\n|hendrycks_ethics |\n|inverse_scaling_mc |\n|iwslt2017 |\n|kmmlu |\n|kmmlu_direct |\n|kmmlu_hard |\n|kmmlu_hard_cot |\n|kmmlu_hard_direct |\n|kobest |\n|lambada |\n|lambada_cloze |\n|lambada_multilingual |\n|lambada_multilingual_stablelm |\n|m_mmlu |\n|math_word_problems |\n|med_concepts_qa_atc_tasks |\n|med_concepts_qa_icd10cm_tasks |\n|med_concepts_qa_icd10proc_tasks |\n|med_concepts_qa_icd9cm_tasks |\n|med_concepts_qa_icd9proc_tasks |\n|mgsm_cot_native |\n|mgsm_direct |\n|mmlu_continuation_humanities |\n|mmlu_continuation_other |\n|mmlu_continuation_social_sciences |\n|mmlu_continuation_stem |\n|mmlu_flan_cot_fewshot_humanities |\n|mmlu_flan_cot_fewshot_other |\n|mmlu_flan_cot_fewshot_social_sciences |\n|mmlu_flan_cot_fewshot_stem |\n|mmlu_flan_cot_zeroshot_humanities |\n|mmlu_flan_cot_zeroshot_other |\n|mmlu_flan_cot_zeroshot_social_sciences |\n|mmlu_flan_cot_zeroshot_stem |\n|mmlu_flan_n_shot_generative_humanities |\n|mmlu_flan_n_shot_generative_other |\n|mmlu_flan_n_shot_generative_social_sciences |\n|mmlu_flan_n_shot_generative_stem 
|\n|mmlu_flan_n_shot_loglikelihood_humanities |\n|mmlu_flan_n_shot_loglikelihood_other |\n|mmlu_flan_n_shot_loglikelihood_social_sciences |\n|mmlu_flan_n_shot_loglikelihood_stem |\n|mmlu_humanities_generative |\n|mmlu_humanities_tasks |\n|mmlu_other_generative |\n|mmlu_other_tasks |\n|mmlu_social_sciences_generative |\n|mmlu_social_sciences_tasks |\n|mmlu_stem_generative |\n|mmlu_stem_tasks |\n|mmlusr_answer_only_humanities_tasks |\n|mmlusr_answer_only_other_tasks |\n|mmlusr_answer_only_social_sciences_tasks |\n|mmlusr_answer_only_stem_tasks |\n|mmlusr_question_and_answer_humanities_tasks |\n|mmlusr_question_and_answer_other_tasks |\n|mmlusr_question_and_answer_social_sciences_tasks|\n|mmlusr_question_and_answer_stem_tasks |\n|mmlusr_question_only_humanities_tasks |\n|mmlusr_question_only_other_tasks |\n|mmlusr_question_only_social_sciences_tasks |\n|mmlusr_question_only_stem_tasks |\n|multiple_choice |\n|paloma |\n|persona |\n|polemo2 |\n|qa4mre |\n|qasper |\n|self_consistency |\n|storycloze |\n|super-glue-lm-eval-v1 |\n|super-glue-lm-eval-v1-seq2seq |\n|super-glue-t5-prompt |\n|sycophancy |\n|tmlu_humanities_tasks |\n|tmlu_other_tasks |\n|tmlu_social_sciences_tasks |\n|tmlu_stem_tasks |\n|tmlu_taiwan_specific |\n|tmmluplus_STEM |\n|tmmluplus_humanities |\n|tmmluplus_other |\n|tmmluplus_social_sciences |\n|translation |\n|truthfulqa |\n|truthfulqa_multilingual |\n|unscramble |\n|wmt14 |\n|wmt16 |\n|xnli_eu_mt_native |\n\n\n| Task | Config Location | Output Type |\n|--------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------|---------------------|\n|aclue_ancient_chinese_culture |lm_eval/tasks/aclue/aclue_ancient_chinese_culture.yaml |multiple_choice |\n|aclue_ancient_literature |lm_eval/tasks/aclue/aclue_ancient_literature.yaml |multiple_choice |\n|aclue_ancient_medical 
|lm_eval/tasks/aclue/aclue_ancient_medical.yaml |multiple_choice |\n|aclue_ancient_phonetics |lm_eval/tasks/aclue/aclue_ancient_phonetics.yaml |multiple_choice |\n|aclue_basic_ancient_chinese |lm_eval/tasks/aclue/aclue_basic_ancient_chinese.yaml |multiple_choice |\n|aclue_couplet_prediction |lm_eval/tasks/aclue/aclue_couplet_prediction.yaml |multiple_choice |\n|aclue_homographic_character_resolution |lm_eval/tasks/aclue/aclue_homographic_character_resolution.yaml |multiple_choice |\n|aclue_named_entity_recognition |lm_eval/tasks/aclue/aclue_named_entity_recognition.yaml |multiple_choice |\n|aclue_poetry_appreciate |lm_eval/tasks/aclue/aclue_poetry_appreciate.yaml |multiple_choice |\n|aclue_poetry_context_prediction |lm_eval/tasks/aclue/aclue_poetry_context_prediction.yaml |multiple_choice |\n|aclue_poetry_quality_assessment |lm_eval/tasks/aclue/aclue_poetry_quality_assessment.yaml |multiple_choice |\n|aclue_poetry_sentiment_analysis |lm_eval/tasks/aclue/aclue_poetry_sentiment_analysis.yaml |multiple_choice |\n|aclue_polysemy_resolution |lm_eval/tasks/aclue/aclue_polysemy_resolution.yaml |multiple_choice |\n|aclue_reading_comprehension |lm_eval/tasks/aclue/aclue_reading_comprehension.yaml |multiple_choice |\n|aclue_sentence_segmentation |lm_eval/tasks/aclue/aclue_sentence_segmentation.yaml |multiple_choice |\n|advanced_ai_risk_fewshot-coordinate-itself |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-itself.yaml |multiple_choice |\n|advanced_ai_risk_fewshot-coordinate-other-ais |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-other-ais.yaml |multiple_choice |\n|advanced_ai_risk_fewshot-coordinate-other-versions |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-coordinate-other-versions.yaml |multiple_choice |\n|advanced_ai_risk_fewshot-corrigible-less-HHH |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-less-HHH.yaml |multiple_choice |\n|advanced_ai_risk_fewshot-corrigible-more-HHH 
|lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-more-HHH.yaml |multiple_choice |\n|advanced_ai_risk_fewshot-corrigible-neutral-HHH |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-corrigible-neutral-HHH.yaml |multiple_choice |\n|advanced_ai_risk_fewshot-myopic-reward |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-myopic-reward.yaml |multiple_choice |\n|advanced_ai_risk_fewshot-one-box-tendency |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-one-box-tendency.yaml |multiple_choice |\n|advanced_ai_risk_fewshot-power-seeking-inclination |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-power-seeking-inclination.yaml |multiple_choice |\n|advanced_ai_risk_fewshot-self-awareness-general-ai |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-general-ai.yaml |multiple_choice |\n|advanced_ai_risk_fewshot-self-awareness-good-text-model |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-good-text-model.yaml |multiple_choice |\n|advanced_ai_risk_fewshot-self-awareness-text-model |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-text-model.yaml |multiple_choice |\n|advanced_ai_risk_fewshot-self-awareness-training-architecture |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-training-architecture.yaml |multiple_choice |\n|advanced_ai_risk_fewshot-self-awareness-training-web-gpt |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-self-awareness-training-web-gpt.yaml |multiple_choice |\n|advanced_ai_risk_fewshot-survival-instinct |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-survival-instinct.yaml |multiple_choice |\n|advanced_ai_risk_fewshot-wealth-seeking-inclination |lm_eval/tasks/model_written_evals/advanced_ai_risk/fewshot-wealth-seeking-inclination.yaml |multiple_choice |\n|advanced_ai_risk_human-coordinate-itself 
|lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-itself.yaml |multiple_choice |\n|advanced_ai_risk_human-coordinate-other-ais |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-other-ais.yaml |multiple_choice |\n|advanced_ai_risk_human-coordinate-other-versions |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-coordinate-other-versions.yaml |multiple_choice |\n|advanced_ai_risk_human-corrigible-less-HHH |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-less-HHH.yaml |multiple_choice |\n|advanced_ai_risk_human-corrigible-more-HHH |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-more-HHH.yaml |multiple_choice |\n|advanced_ai_risk_human-corrigible-neutral-HHH |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-corrigible-neutral-HHH.yaml |multiple_choice |\n|advanced_ai_risk_human-myopic-reward |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-myopic-reward.yaml |multiple_choice |\n|advanced_ai_risk_human-one-box-tendency |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-one-box-tendency.yaml |multiple_choice |\n|advanced_ai_risk_human-power-seeking-inclination |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-power-seeking-inclination.yaml |multiple_choice |\n|advanced_ai_risk_human-self-awareness-general-ai |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-general-ai.yaml |multiple_choice |\n|advanced_ai_risk_human-self-awareness-good-text-model |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-good-text-model.yaml |multiple_choice |\n|advanced_ai_risk_human-self-awareness-text-model |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-text-model.yaml |multiple_choice |\n|advanced_ai_risk_human-self-awareness-training-architecture |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-training-architecture.yaml |multiple_choice 
|\n|advanced_ai_risk_human-self-awareness-web-gpt |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-self-awareness-web-gpt.yaml |multiple_choice |\n|advanced_ai_risk_human-survival-instinct |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-survival-instinct.yaml |multiple_choice |\n|advanced_ai_risk_human-wealth-seeking-inclination |lm_eval/tasks/model_written_evals/advanced_ai_risk/human-wealth-seeking-inclination.yaml |multiple_choice |\n|advanced_ai_risk_lm-coordinate-itself |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-itself.yaml |multiple_choice |\n|advanced_ai_risk_lm-coordinate-other-ais |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-other-ais.yaml |multiple_choice |\n|advanced_ai_risk_lm-coordinate-other-versions |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-coordinate-other-versions.yaml |multiple_choice |\n|advanced_ai_risk_lm-corrigible-less-HHH |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-less-HHH.yaml |multiple_choice |\n|advanced_ai_risk_lm-corrigible-more-HHH |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-more-HHH.yaml |multiple_choice |\n|advanced_ai_risk_lm-corrigible-neutral-HHH |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-corrigible-neutral-HHH.yaml |multiple_choice |\n|advanced_ai_risk_lm-myopic-reward |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-myopic-reward.yaml |multiple_choice |\n|advanced_ai_risk_lm-one-box-tendency |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-one-box-tendency.yaml |multiple_choice |\n|advanced_ai_risk_lm-power-seeking-inclination |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-power-seeking-inclination.yaml |multiple_choice |\n|advanced_ai_risk_lm-self-awareness-general-ai |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-general-ai.yaml |multiple_choice |\n|advanced_ai_risk_lm-self-awareness-good-text-model 
|lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-good-text-model.yaml |multiple_choice |\n|advanced_ai_risk_lm-self-awareness-text-model |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-text-model.yaml |multiple_choice |\n|advanced_ai_risk_lm-self-awareness-training-architecture |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-architecture.yaml |multiple_choice |\n|advanced_ai_risk_lm-self-awareness-training-nn-architecture |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-nn-architecture.yaml |multiple_choice |\n|advanced_ai_risk_lm-self-awareness-training-web-gpt |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-self-awareness-training-web-gpt.yaml |multiple_choice |\n|advanced_ai_risk_lm-survival-instinct |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-survival-instinct.yaml |multiple_choice |\n|advanced_ai_risk_lm-wealth-seeking-inclination |lm_eval/tasks/model_written_evals/advanced_ai_risk/lm-wealth-seeking-inclination.yaml |multiple_choice |\n|aexams_Biology |lm_eval/tasks/aexams/aexams_Biology.yaml |multiple_choice |\n|aexams_IslamicStudies |lm_eval/tasks/aexams/aexams_IslamicStudies.yaml |multiple_choice |\n|aexams_Physics |lm_eval/tasks/aexams/aexams_Physics.yaml |multiple_choice |\n|aexams_Science |lm_eval/tasks/aexams/aexams_Science.yaml |multiple_choice |\n|aexams_Social |lm_eval/tasks/aexams/aexams_Social.yaml |multiple_choice |\n|afrimgsm_direct_amh |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_amh.yaml |generate_until |\n|afrimgsm_direct_eng |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_eng.yaml |generate_until |\n|afrimgsm_direct_ewe |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_ewe.yaml |generate_until |\n|afrimgsm_direct_fra |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_fra.yaml |generate_until |\n|afrimgsm_direct_hau |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_hau.yaml |generate_until |\n|afrimgsm_direct_ibo 
|lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_ibo.yaml |generate_until |\n|afrimgsm_direct_kin |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_kin.yaml |generate_until |\n|afrimgsm_direct_lin |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_lin.yaml |generate_until |\n|afrimgsm_direct_lug |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_lug.yaml |generate_until |\n|afrimgsm_direct_orm |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_orm.yaml |generate_until |\n|afrimgsm_direct_sna |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_sna.yaml |generate_until |\n|afrimgsm_direct_sot |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_sot.yaml |generate_until |\n|afrimgsm_direct_swa |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_swa.yaml |generate_until |\n|afrimgsm_direct_twi |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_twi.yaml |generate_until |\n|afrimgsm_direct_wol |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_wol.yaml |generate_until |\n|afrimgsm_direct_xho |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_xho.yaml |generate_until |\n|afrimgsm_direct_yor |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_yor.yaml |generate_until |\n|afrimgsm_direct_zul |lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_zul.yaml |generate_until |\n|afrimgsm_en_cot_amh |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_amh.yaml |generate_until |\n|afrimgsm_en_cot_eng |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_eng.yaml |generate_until |\n|afrimgsm_en_cot_ewe |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_ewe.yaml |generate_until |\n|afrimgsm_en_cot_fra |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_fra.yaml |generate_until |\n|afrimgsm_en_cot_hau |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_hau.yaml |generate_until |\n|afrimgsm_en_cot_ibo |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_ibo.yaml |generate_until |\n|afrimgsm_en_cot_kin |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_kin.yaml |generate_until |\n|afrimgsm_en_cot_lin |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_lin.yaml |generate_until 
|\n|afrimgsm_en_cot_lug |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_lug.yaml |generate_until |\n|afrimgsm_en_cot_orm |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_orm.yaml |generate_until |\n|afrimgsm_en_cot_sna |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_sna.yaml |generate_until |\n|afrimgsm_en_cot_sot |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_sot.yaml |generate_until |\n|afrimgsm_en_cot_swa |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_swa.yaml |generate_until |\n|afrimgsm_en_cot_twi |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_twi.yaml |generate_until |\n|afrimgsm_en_cot_wol |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_wol.yaml |generate_until |\n|afrimgsm_en_cot_xho |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_xho.yaml |generate_until |\n|afrimgsm_en_cot_yor |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_yor.yaml |generate_until |\n|afrimgsm_en_cot_zul |lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_zul.yaml |generate_until |\n|afrimgsm_translate_direct_amh |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_amh.yaml |generate_until |\n|afrimgsm_translate_direct_eng |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_eng.yaml |generate_until |\n|afrimgsm_translate_direct_ewe |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_ewe.yaml |generate_until |\n|afrimgsm_translate_direct_fra |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_fra.yaml |generate_until |\n|afrimgsm_translate_direct_hau |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_hau.yaml |generate_until |\n|afrimgsm_translate_direct_ibo |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_ibo.yaml |generate_until |\n|afrimgsm_translate_direct_kin |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_kin.yaml |generate_until |\n|afrimgsm_translate_direct_lin |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_lin.yaml |generate_until |\n|afrimgsm_translate_direct_lug |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_lug.yaml |generate_until 
|\n|afrimgsm_translate_direct_orm |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_orm.yaml |generate_until |\n|afrimgsm_translate_direct_sna |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_sna.yaml |generate_until |\n|afrimgsm_translate_direct_sot |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_sot.yaml |generate_until |\n|afrimgsm_translate_direct_swa |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_swa.yaml |generate_until |\n|afrimgsm_translate_direct_twi |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_twi.yaml |generate_until |\n|afrimgsm_translate_direct_wol |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_wol.yaml |generate_until |\n|afrimgsm_translate_direct_xho |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_xho.yaml |generate_until |\n|afrimgsm_translate_direct_yor |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_yor.yaml |generate_until |\n|afrimgsm_translate_direct_zul |lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_zul.yaml |generate_until |\n|afrimmlu_direct_amh |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_amh.yaml |multiple_choice |\n|afrimmlu_direct_eng |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_eng.yaml |multiple_choice |\n|afrimmlu_direct_ewe |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ewe.yaml |multiple_choice |\n|afrimmlu_direct_fra |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_fra.yaml |multiple_choice |\n|afrimmlu_direct_hau |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_hau.yaml |multiple_choice |\n|afrimmlu_direct_ibo |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ibo.yaml |multiple_choice |\n|afrimmlu_direct_kin |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_kin.yaml |multiple_choice |\n|afrimmlu_direct_lin |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lin.yaml |multiple_choice |\n|afrimmlu_direct_lug |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lug.yaml |multiple_choice |\n|afrimmlu_direct_orm |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_orm.yaml |multiple_choice 
|\n|afrimmlu_direct_sna |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sna.yaml |multiple_choice |\n|afrimmlu_direct_sot |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sot.yaml |multiple_choice |\n|afrimmlu_direct_swa |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_swa.yaml |multiple_choice |\n|afrimmlu_direct_twi |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_twi.yaml |multiple_choice |\n|afrimmlu_direct_wol |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_wol.yaml |multiple_choice |\n|afrimmlu_direct_xho |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_xho.yaml |multiple_choice |\n|afrimmlu_direct_yor |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_yor.yaml |multiple_choice |\n|afrimmlu_direct_zul |lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_zul.yaml |multiple_choice |\n|afrimmlu_translate_amh |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_amh.yaml |multiple_choice |\n|afrimmlu_translate_eng |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_eng.yaml |multiple_choice |\n|afrimmlu_translate_ewe |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ewe.yaml |multiple_choice |\n|afrimmlu_translate_fra |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_fra.yaml |multiple_choice |\n|afrimmlu_translate_hau |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_hau.yaml |multiple_choice |\n|afrimmlu_translate_ibo |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ibo.yaml |multiple_choice |\n|afrimmlu_translate_kin |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_kin.yaml |multiple_choice |\n|afrimmlu_translate_lin |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lin.yaml |multiple_choice |\n|afrimmlu_translate_lug |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lug.yaml |multiple_choice |\n|afrimmlu_translate_orm |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_orm.yaml |multiple_choice |\n|afrimmlu_translate_sna |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sna.yaml |multiple_choice |\n|afrimmlu_translate_sot 
|lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sot.yaml |multiple_choice |\n|afrimmlu_translate_swa |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_swa.yaml |multiple_choice |\n|afrimmlu_translate_twi |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_twi.yaml |multiple_choice |\n|afrimmlu_translate_wol |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_wol.yaml |multiple_choice |\n|afrimmlu_translate_xho |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_xho.yaml |multiple_choice |\n|afrimmlu_translate_yor |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_yor.yaml |multiple_choice |\n|afrimmlu_translate_zul |lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_zul.yaml |multiple_choice |\n|afrixnli_en_direct_amh |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_amh.yaml |multiple_choice |\n|afrixnli_en_direct_eng |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_eng.yaml |multiple_choice |\n|afrixnli_en_direct_ewe |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_ewe.yaml |multiple_choice |\n|afrixnli_en_direct_fra |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_fra.yaml |multiple_choice |\n|afrixnli_en_direct_hau |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_hau.yaml |multiple_choice |\n|afrixnli_en_direct_ibo |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_ibo.yaml |multiple_choice |\n|afrixnli_en_direct_kin |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_kin.yaml |multiple_choice |\n|afrixnli_en_direct_lin |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_lin.yaml |multiple_choice |\n|afrixnli_en_direct_lug |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_lug.yaml |multiple_choice |\n|afrixnli_en_direct_orm |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_orm.yaml |multiple_choice |\n|afrixnli_en_direct_sna |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_sna.yaml 
|multiple_choice |\n|afrixnli_en_direct_sot |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_sot.yaml |multiple_choice |\n|afrixnli_en_direct_swa |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_swa.yaml |multiple_choice |\n|afrixnli_en_direct_twi |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_twi.yaml |multiple_choice |\n|afrixnli_en_direct_wol |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_wol.yaml |multiple_choice |\n|afrixnli_en_direct_xho |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_xho.yaml |multiple_choice |\n|afrixnli_en_direct_yor |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_yor.yaml |multiple_choice |\n|afrixnli_en_direct_zul |lm_eval/tasks/afrixnli/anli prompt/en-direct/afrixnli_en_direct_zul.yaml |multiple_choice |\n|afrixnli_manual_direct_amh |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_amh.yaml |multiple_choice |\n|afrixnli_manual_direct_eng |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_eng.yaml |multiple_choice |\n|afrixnli_manual_direct_ewe |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_ewe.yaml |multiple_choice |\n|afrixnli_manual_direct_fra |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_fra.yaml |multiple_choice |\n|afrixnli_manual_direct_hau |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_hau.yaml |multiple_choice |\n|afrixnli_manual_direct_ibo |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_ibo.yaml |multiple_choice |\n|afrixnli_manual_direct_kin |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_kin.yaml |multiple_choice |\n|afrixnli_manual_direct_lin |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_lin.yaml |multiple_choice |\n|afrixnli_manual_direct_lug |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_lug.yaml |multiple_choice |\n|afrixnli_manual_direct_orm |lm_eval/tasks/afrixnli/lai 
prompt/direct/afrixnli_manual_direct_orm.yaml |multiple_choice |\n|afrixnli_manual_direct_sna |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_sna.yaml |multiple_choice |\n|afrixnli_manual_direct_sot |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_sot.yaml |multiple_choice |\n|afrixnli_manual_direct_swa |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_swa.yaml |multiple_choice |\n|afrixnli_manual_direct_twi |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_twi.yaml |multiple_choice |\n|afrixnli_manual_direct_wol |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_wol.yaml |multiple_choice |\n|afrixnli_manual_direct_xho |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_xho.yaml |multiple_choice |\n|afrixnli_manual_direct_yor |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_yor.yaml |multiple_choice |\n|afrixnli_manual_direct_zul |lm_eval/tasks/afrixnli/lai prompt/direct/afrixnli_manual_direct_zul.yaml |multiple_choice |\n|afrixnli_manual_translate_amh |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_amh.yaml |multiple_choice |\n|afrixnli_manual_translate_ewe |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_ewe.yaml |multiple_choice |\n|afrixnli_manual_translate_fra |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_fra.yaml |multiple_choice |\n|afrixnli_manual_translate_hau |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_hau.yaml |multiple_choice |\n|afrixnli_manual_translate_ibo |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_ibo.yaml |multiple_choice |\n|afrixnli_manual_translate_kin |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_kin.yaml |multiple_choice |\n|afrixnli_manual_translate_lin |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_lin.yaml |multiple_choice |\n|afrixnli_manual_translate_lug 
|lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_lug.yaml |multiple_choice |\n|afrixnli_manual_translate_orm |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_orm.yaml |multiple_choice |\n|afrixnli_manual_translate_sna |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_sna.yaml |multiple_choice |\n|afrixnli_manual_translate_sot |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_sot.yaml |multiple_choice |\n|afrixnli_manual_translate_swa |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_swa.yaml |multiple_choice |\n|afrixnli_manual_translate_twi |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_twi.yaml |multiple_choice |\n|afrixnli_manual_translate_wol |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_wol.yaml |multiple_choice |\n|afrixnli_manual_translate_xho |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_xho.yaml |multiple_choice |\n|afrixnli_manual_translate_yor |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_yor.yaml |multiple_choice |\n|afrixnli_manual_translate_zul |lm_eval/tasks/afrixnli/lai prompt/translate/afrixnli_manual_translate_zul.yaml |multiple_choice |\n|afrixnli_native_direct_amh |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_amh.yaml |multiple_choice |\n|afrixnli_native_direct_eng |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_eng.yaml |multiple_choice |\n|afrixnli_native_direct_ewe |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_ewe.yaml |multiple_choice |\n|afrixnli_native_direct_fra |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_fra.yaml |multiple_choice |\n|afrixnli_native_direct_hau |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_hau.yaml |multiple_choice |\n|afrixnli_native_direct_ibo |lm_eval/tasks/afrixnli/anli 
prompt/native-direct/afrixnli_native_direct_ibo.yaml |multiple_choice |\n|afrixnli_native_direct_kin |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_kin.yaml |multiple_choice |\n|afrixnli_native_direct_lin |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_lin.yaml |multiple_choice |\n|afrixnli_native_direct_lug |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_lug.yaml |multiple_choice |\n|afrixnli_native_direct_orm |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_orm.yaml |multiple_choice |\n|afrixnli_native_direct_sna |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_sna.yaml |multiple_choice |\n|afrixnli_native_direct_sot |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_sot.yaml |multiple_choice |\n|afrixnli_native_direct_swa |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_swa.yaml |multiple_choice |\n|afrixnli_native_direct_twi |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_twi.yaml |multiple_choice |\n|afrixnli_native_direct_wol |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_wol.yaml |multiple_choice |\n|afrixnli_native_direct_xho |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_xho.yaml |multiple_choice |\n|afrixnli_native_direct_yor |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_yor.yaml |multiple_choice |\n|afrixnli_native_direct_zul |lm_eval/tasks/afrixnli/anli prompt/native-direct/afrixnli_native_direct_zul.yaml |multiple_choice |\n|afrixnli_translate_amh |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_amh.yaml |multiple_choice |\n|afrixnli_translate_ewe |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_ewe.yaml |multiple_choice |\n|afrixnli_translate_fra |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_fra.yaml |multiple_choice 
|\n|afrixnli_translate_hau |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_hau.yaml |multiple_choice |\n|afrixnli_translate_ibo |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_ibo.yaml |multiple_choice |\n|afrixnli_translate_kin |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_kin.yaml |multiple_choice |\n|afrixnli_translate_lin |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_lin.yaml |multiple_choice |\n|afrixnli_translate_lug |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_lug.yaml |multiple_choice |\n|afrixnli_translate_orm |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_orm.yaml |multiple_choice |\n|afrixnli_translate_sna |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_sna.yaml |multiple_choice |\n|afrixnli_translate_sot |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_sot.yaml |multiple_choice |\n|afrixnli_translate_swa |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_swa.yaml |multiple_choice |\n|afrixnli_translate_twi |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_twi.yaml |multiple_choice |\n|afrixnli_translate_wol |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_wol.yaml |multiple_choice |\n|afrixnli_translate_xho |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_xho.yaml |multiple_choice |\n|afrixnli_translate_yor |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_yor.yaml |multiple_choice |\n|afrixnli_translate_zul |lm_eval/tasks/afrixnli/anli prompt/translate/afrixnli_translate_zul.yaml |multiple_choice |\n|agieval_aqua_rat |lm_eval/tasks/agieval/aqua-rat.yaml |multiple_choice |\n|agieval_gaokao_biology |lm_eval/tasks/agieval/gaokao-biology.yaml |multiple_choice |\n|agieval_gaokao_chemistry |lm_eval/tasks/agieval/gaokao-chemistry.yaml |multiple_choice |\n|agieval_gaokao_chinese |lm_eval/tasks/agieval/gaokao-chinese.yaml |multiple_choice 
|\n|agieval_gaokao_english |lm_eval/tasks/agieval/gaokao-english.yaml |multiple_choice |\n|agieval_gaokao_geography |lm_eval/tasks/agieval/gaokao-geography.yaml |multiple_choice |\n|agieval_gaokao_history |lm_eval/tasks/agieval/gaokao-history.yaml |multiple_choice |\n|agieval_gaokao_mathcloze |lm_eval/tasks/agieval/gaokao-mathcloze.yaml |generate_until |\n|agieval_gaokao_mathqa |lm_eval/tasks/agieval/gaokao-mathqa.yaml |multiple_choice |\n|agieval_gaokao_physics |lm_eval/tasks/agieval/gaokao-physics.yaml |multiple_choice |\n|agieval_jec_qa_ca |lm_eval/tasks/agieval/jec-qa-ca.yaml |multiple_choice |\n|agieval_jec_qa_kd |lm_eval/tasks/agieval/jec-qa-kd.yaml |multiple_choice |\n|agieval_logiqa_en |lm_eval/tasks/agieval/logiqa-en.yaml |multiple_choice |\n|agieval_logiqa_zh |lm_eval/tasks/agieval/logiqa-zh.yaml |multiple_choice |\n|agieval_lsat_ar |lm_eval/tasks/agieval/lsat-ar.yaml |multiple_choice |\n|agieval_lsat_lr |lm_eval/tasks/agieval/lsat-lr.yaml |multiple_choice |\n|agieval_lsat_rc |lm_eval/tasks/agieval/lsat-rc.yaml |multiple_choice |\n|agieval_math |lm_eval/tasks/agieval/math.yaml |generate_until |\n|agieval_sat_en |lm_eval/tasks/agieval/sat-en.yaml |multiple_choice |\n|agieval_sat_en_without_passage |lm_eval/tasks/agieval/sat-en-without-passage.yaml |multiple_choice |\n|agieval_sat_math |lm_eval/tasks/agieval/sat-math.yaml |multiple_choice |\n|anagrams1 |lm_eval/tasks/unscramble/anagrams1.yaml |generate_until |\n|anagrams2 |lm_eval/tasks/unscramble/anagrams2.yaml |generate_until |\n|anli_r1 |lm_eval/tasks/anli/anli_r1.yaml |multiple_choice |\n|anli_r2 |lm_eval/tasks/anli/anli_r2.yaml |multiple_choice |\n|anli_r3 |lm_eval/tasks/anli/anli_r3.yaml |multiple_choice |\n|arabicmmlu_arabic_language_(general) |lm_eval/tasks/arabicmmlu/arabicmmlu_arabic_language_general.yaml |multiple_choice |\n|arabicmmlu_arabic_language_(grammar) |lm_eval/tasks/arabicmmlu/arabicmmlu_arabic_language_grammar.yaml |multiple_choice |\n|arabicmmlu_driving_test 
|lm_eval/tasks/arabicmmlu/arabicmmlu_driving_test.yaml |multiple_choice |\n|arabicmmlu_general_knowledge |lm_eval/tasks/arabicmmlu/arabicmmlu_general_knowledge.yaml |multiple_choice |\n|arabicmmlu_high_arabic_language |lm_eval/tasks/arabicmmlu/arabicmmlu_high_arabic_language.yaml |multiple_choice |\n|arabicmmlu_high_biology |lm_eval/tasks/arabicmmlu/arabicmmlu_high_biology.yaml |multiple_choice |\n|arabicmmlu_high_civics |lm_eval/tasks/arabicmmlu/arabicmmlu_high_civics.yaml |multiple_choice |\n|arabicmmlu_high_computer_science |lm_eval/tasks/arabicmmlu/arabicmmlu_high_computer_science.yaml |multiple_choice |\n|arabicmmlu_high_economics |lm_eval/tasks/arabicmmlu/arabicmmlu_high_economics.yaml |multiple_choice |\n|arabicmmlu_high_geography |lm_eval/tasks/arabicmmlu/arabicmmlu_high_geography.yaml |multiple_choice |\n|arabicmmlu_high_history |lm_eval/tasks/arabicmmlu/arabicmmlu_high_history.yaml |multiple_choice |\n|arabicmmlu_high_islamic_studies |lm_eval/tasks/arabicmmlu/arabicmmlu_high_islamic_studies.yaml |multiple_choice |\n|arabicmmlu_high_philosophy |lm_eval/tasks/arabicmmlu/arabicmmlu_high_philosophy.yaml |multiple_choice |\n|arabicmmlu_high_physics |lm_eval/tasks/arabicmmlu/arabicmmlu_high_physics.yaml |multiple_choice |\n|arabicmmlu_islamic_studies |lm_eval/tasks/arabicmmlu/arabicmmlu_islamic_studies.yaml |multiple_choice |\n|arabicmmlu_middle_arabic_language |lm_eval/tasks/arabicmmlu/arabicmmlu_middle_arabic_language.yaml |multiple_choice |\n|arabicmmlu_middle_civics |lm_eval/tasks/arabicmmlu/arabicmmlu_middle_civics.yaml |multiple_choice |\n|arabicmmlu_middle_computer_science |lm_eval/tasks/arabicmmlu/arabicmmlu_middle_computer_science.yaml |multiple_choice |\n|arabicmmlu_middle_economics |lm_eval/tasks/arabicmmlu/arabicmmlu_middle_economics.yaml |multiple_choice |\n|arabicmmlu_middle_general_knowledge |lm_eval/tasks/arabicmmlu/arabicmmlu_middle_general_knowledge.yaml |multiple_choice |\n|arabicmmlu_middle_geography 
|lm_eval/tasks/arabicmmlu/arabicmmlu_middle_geography.yaml |multiple_choice |\n|arabicmmlu_middle_history |lm_eval/tasks/arabicmmlu/arabicmmlu_middle_history.yaml |multiple_choice |\n|arabicmmlu_middle_islamic_studies |lm_eval/tasks/arabicmmlu/arabicmmlu_middle_islamic_studies.yaml |multiple_choice |\n|arabicmmlu_middle_natural_science |lm_eval/tasks/arabicmmlu/arabicmmlu_middle_natural_science.yaml |multiple_choice |\n|arabicmmlu_middle_social_science |lm_eval/tasks/arabicmmlu/arabicmmlu_middle_social_science.yaml |multiple_choice |\n|arabicmmlu_primary_arabic_language |lm_eval/tasks/arabicmmlu/arabicmmlu_primary_arabic_language.yaml |multiple_choice |\n|arabicmmlu_primary_computer_science |lm_eval/tasks/arabicmmlu/arabicmmlu_primary_computer_science.yaml |multiple_choice |\n|arabicmmlu_primary_general_knowledge |lm_eval/tasks/arabicmmlu/arabicmmlu_primary_general_knowledge.yaml |multiple_choice |\n|arabicmmlu_primary_geography |lm_eval/tasks/arabicmmlu/arabicmmlu_primary_geography.yaml |multiple_choice |\n|arabicmmlu_primary_history |lm_eval/tasks/arabicmmlu/arabicmmlu_primary_history.yaml |multiple_choice |\n|arabicmmlu_primary_islamic_studies |lm_eval/tasks/arabicmmlu/arabicmmlu_primary_islamic_studies.yaml |multiple_choice |\n|arabicmmlu_primary_math |lm_eval/tasks/arabicmmlu/arabicmmlu_primary_math.yaml |multiple_choice |\n|arabicmmlu_primary_natural_science |lm_eval/tasks/arabicmmlu/arabicmmlu_primary_natural_science.yaml |multiple_choice |\n|arabicmmlu_primary_social_science |lm_eval/tasks/arabicmmlu/arabicmmlu_primary_social_science.yaml |multiple_choice |\n|arabicmmlu_prof_law |lm_eval/tasks/arabicmmlu/arabicmmlu_prof_law.yaml |multiple_choice |\n|arabicmmlu_univ_accounting |lm_eval/tasks/arabicmmlu/arabicmmlu_univ_accounting.yaml |multiple_choice |\n|arabicmmlu_univ_computer_science |lm_eval/tasks/arabicmmlu/arabicmmlu_univ_computer_science.yaml |multiple_choice |\n|arabicmmlu_univ_economics |lm_eval/tasks/arabicmmlu/arabicmmlu_univ_economics.yaml 
|multiple_choice |\n|arabicmmlu_univ_management |lm_eval/tasks/arabicmmlu/arabicmmlu_univ_management.yaml |multiple_choice |\n|arabicmmlu_univ_political_science |lm_eval/tasks/arabicmmlu/arabicmmlu_univ_political_science.yaml |multiple_choice |\n|arc_ar |lm_eval/tasks/okapi/arc_multilingual/arc_ar.yaml |multiple_choice |\n|arc_bn |lm_eval/tasks/okapi/arc_multilingual/arc_bn.yaml |multiple_choice |\n|arc_ca |lm_eval/tasks/okapi/arc_multilingual/arc_ca.yaml |multiple_choice |\n|arc_challenge |lm_eval/tasks/arc/arc_challenge.yaml |multiple_choice |\n|arc_challenge_mt_da |lm_eval/tasks/arc_mt/arc_challenge_mt_da.yaml |multiple_choice |\n|arc_challenge_mt_de |lm_eval/tasks/arc_mt/arc_challenge_mt_de.yaml |multiple_choice |\n|arc_challenge_mt_el |lm_eval/tasks/arc_mt/arc_challenge_mt_el.yaml |multiple_choice |\n|arc_challenge_mt_es |lm_eval/tasks/arc_mt/arc_challenge_mt_es.yaml |multiple_choice |\n|arc_challenge_mt_fi |lm_eval/tasks/arc_mt/arc_challenge_mt_fi.yaml |multiple_choice |\n|arc_challenge_mt_hu |lm_eval/tasks/arc_mt/arc_challenge_mt_hu.yaml |multiple_choice |\n|arc_challenge_mt_is |lm_eval/tasks/arc_mt/arc_challenge_mt_is.yaml |multiple_choice |\n|arc_challenge_mt_it |lm_eval/tasks/arc_mt/arc_challenge_mt_it.yaml |multiple_choice |\n|arc_challenge_mt_nb |lm_eval/tasks/arc_mt/arc_challenge_mt_nb.yaml |multiple_choice |\n|arc_challenge_mt_pl |lm_eval/tasks/arc_mt/arc_challenge_mt_pl.yaml |multiple_choice |\n|arc_challenge_mt_pt |lm_eval/tasks/arc_mt/arc_challenge_mt_pt.yaml |multiple_choice |\n|arc_challenge_mt_sv |lm_eval/tasks/arc_mt/arc_challenge_mt_sv.yaml |multiple_choice |\n|arc_da |lm_eval/tasks/okapi/arc_multilingual/arc_da.yaml |multiple_choice |\n|arc_de |lm_eval/tasks/okapi/arc_multilingual/arc_de.yaml |multiple_choice |\n|arc_easy |lm_eval/tasks/arc/arc_easy.yaml |multiple_choice |\n|arc_es |lm_eval/tasks/okapi/arc_multilingual/arc_es.yaml |multiple_choice |\n|arc_eu |lm_eval/tasks/okapi/arc_multilingual/arc_eu.yaml |multiple_choice |\n|arc_fr 
|lm_eval/tasks/okapi/arc_multilingual/arc_fr.yaml |multiple_choice |\n|arc_gu |lm_eval/tasks/okapi/arc_multilingual/arc_gu.yaml |multiple_choice |\n|arc_hi |lm_eval/tasks/okapi/arc_multilingual/arc_hi.yaml |multiple_choice |\n|arc_hr |lm_eval/tasks/okapi/arc_multilingual/arc_hr.yaml |multiple_choice |\n|arc_hu |lm_eval/tasks/okapi/arc_multilingual/arc_hu.yaml |multiple_choice |\n|arc_hy |lm_eval/tasks/okapi/arc_multilingual/arc_hy.yaml |multiple_choice |\n|arc_id |lm_eval/tasks/okapi/arc_multilingual/arc_id.yaml |multiple_choice |\n|arc_it |lm_eval/tasks/okapi/arc_multilingual/arc_it.yaml |multiple_choice |\n|arc_kn |lm_eval/tasks/okapi/arc_multilingual/arc_kn.yaml |multiple_choice |\n|arc_ml |lm_eval/tasks/okapi/arc_multilingual/arc_ml.yaml |multiple_choice |\n|arc_mr |lm_eval/tasks/okapi/arc_multilingual/arc_mr.yaml |multiple_choice |\n|arc_ne |lm_eval/tasks/okapi/arc_multilingual/arc_ne.yaml |multiple_choice |\n|arc_nl |lm_eval/tasks/okapi/arc_multilingual/arc_nl.yaml |multiple_choice |\n|arc_pt |lm_eval/tasks/okapi/arc_multilingual/arc_pt.yaml |multiple_choice |\n|arc_ro |lm_eval/tasks/okapi/arc_multilingual/arc_ro.yaml |multiple_choice |\n|arc_ru |lm_eval/tasks/okapi/arc_multilingual/arc_ru.yaml |multiple_choice |\n|arc_sk |lm_eval/tasks/okapi/arc_multilingual/arc_sk.yaml |multiple_choice |\n|arc_sr |lm_eval/tasks/okapi/arc_multilingual/arc_sr.yaml |multiple_choice |\n|arc_sv |lm_eval/tasks/okapi/arc_multilingual/arc_sv.yaml |multiple_choice |\n|arc_ta |lm_eval/tasks/okapi/arc_multilingual/arc_ta.yaml |multiple_choice |\n|arc_te |lm_eval/tasks/okapi/arc_multilingual/arc_te.yaml |multiple_choice |\n|arc_uk |lm_eval/tasks/okapi/arc_multilingual/arc_uk.yaml |multiple_choice |\n|arc_vi |lm_eval/tasks/okapi/arc_multilingual/arc_vi.yaml |multiple_choice |\n|arc_zh |lm_eval/tasks/okapi/arc_multilingual/arc_zh.yaml |multiple_choice |\n|arithmetic_1dc |lm_eval/tasks/arithmetic/arithmetic_1dc.yaml |loglikelihood |\n|arithmetic_2da 
|lm_eval/tasks/arithmetic/arithmetic_2da.yaml |loglikelihood |\n|arithmetic_2dm |lm_eval/tasks/arithmetic/arithmetic_2dm.yaml |loglikelihood |\n|arithmetic_2ds |lm_eval/tasks/arithmetic/arithmetic_2ds.yaml |loglikelihood |\n|arithmetic_3da |lm_eval/tasks/arithmetic/arithmetic_3da.yaml |loglikelihood |\n|arithmetic_3ds |lm_eval/tasks/arithmetic/arithmetic_3ds.yaml |loglikelihood |\n|arithmetic_4da |lm_eval/tasks/arithmetic/arithmetic_4da.yaml |loglikelihood |\n|arithmetic_4ds |lm_eval/tasks/arithmetic/arithmetic_4ds.yaml |loglikelihood |\n|arithmetic_5da |lm_eval/tasks/arithmetic/arithmetic_5da.yaml |loglikelihood |\n|arithmetic_5ds |lm_eval/tasks/arithmetic/arithmetic_5ds.yaml |loglikelihood |\n|asdiv |lm_eval/tasks/asdiv/default.yaml |loglikelihood |\n|asdiv_cot_llama |lm_eval/tasks/asdiv/asdiv-cot-llama.yaml |generate_until |\n|babi |lm_eval/tasks/babi/babi.yaml |generate_until |\n|bbh_cot_fewshot_boolean_expressions |lm_eval/tasks/bbh/cot_fewshot/boolean_expressions.yaml |generate_until |\n|bbh_cot_fewshot_causal_judgement |lm_eval/tasks/bbh/cot_fewshot/causal_judgement.yaml |generate_until |\n|bbh_cot_fewshot_date_understanding |lm_eval/tasks/bbh/cot_fewshot/date_understanding.yaml |generate_until |\n|bbh_cot_fewshot_disambiguation_qa |lm_eval/tasks/bbh/cot_fewshot/disambiguation_qa.yaml |generate_until |\n|bbh_cot_fewshot_dyck_languages |lm_eval/tasks/bbh/cot_fewshot/dyck_languages.yaml |generate_until |\n|bbh_cot_fewshot_formal_fallacies |lm_eval/tasks/bbh/cot_fewshot/formal_fallacies.yaml |generate_until |\n|bbh_cot_fewshot_geometric_shapes |lm_eval/tasks/bbh/cot_fewshot/geometric_shapes.yaml |generate_until |\n|bbh_cot_fewshot_hyperbaton |lm_eval/tasks/bbh/cot_fewshot/hyperbaton.yaml |generate_until |\n|bbh_cot_fewshot_logical_deduction_five_objects |lm_eval/tasks/bbh/cot_fewshot/logical_deduction_five_objects.yaml |generate_until |\n|bbh_cot_fewshot_logical_deduction_seven_objects |lm_eval/tasks/bbh/cot_fewshot/logical_deduction_seven_objects.yaml 
|generate_until |\n|bbh_cot_fewshot_logical_deduction_three_objects |lm_eval/tasks/bbh/cot_fewshot/logical_deduction_three_objects.yaml |generate_until |\n|bbh_cot_fewshot_movie_recommendation |lm_eval/tasks/bbh/cot_fewshot/movie_recommendation.yaml |generate_until |\n|bbh_cot_fewshot_multistep_arithmetic_two |lm_eval/tasks/bbh/cot_fewshot/multistep_arithmetic_two.yaml |generate_until |\n|bbh_cot_fewshot_navigate |lm_eval/tasks/bbh/cot_fewshot/navigate.yaml |generate_until |\n|bbh_cot_fewshot_object_counting |lm_eval/tasks/bbh/cot_fewshot/object_counting.yaml |generate_until |\n|bbh_cot_fewshot_penguins_in_a_table |lm_eval/tasks/bbh/cot_fewshot/penguins_in_a_table.yaml |generate_until |\n|bbh_cot_fewshot_reasoning_about_colored_objects |lm_eval/tasks/bbh/cot_fewshot/reasoning_about_colored_objects.yaml |generate_until |\n|bbh_cot_fewshot_ruin_names |lm_eval/tasks/bbh/cot_fewshot/ruin_names.yaml |generate_until |\n|bbh_cot_fewshot_salient_translation_error_detection |lm_eval/tasks/bbh/cot_fewshot/salient_translation_error_detection.yaml |generate_until |\n|bbh_cot_fewshot_snarks |lm_eval/tasks/bbh/cot_fewshot/snarks.yaml |generate_until |\n|bbh_cot_fewshot_sports_understanding |lm_eval/tasks/bbh/cot_fewshot/sports_understanding.yaml |generate_until |\n|bbh_cot_fewshot_temporal_sequences |lm_eval/tasks/bbh/cot_fewshot/temporal_sequences.yaml |generate_until |\n|bbh_cot_fewshot_tracking_shuffled_objects_five_objects |lm_eval/tasks/bbh/cot_fewshot/tracking_shuffled_objects_five_objects.yaml |generate_until |\n|bbh_cot_fewshot_tracking_shuffled_objects_seven_objects |lm_eval/tasks/bbh/cot_fewshot/tracking_shuffled_objects_seven_objects.yaml |generate_until |\n|bbh_cot_fewshot_tracking_shuffled_objects_three_objects |lm_eval/tasks/bbh/cot_fewshot/tracking_shuffled_objects_three_objects.yaml |generate_until |\n|bbh_cot_fewshot_web_of_lies |lm_eval/tasks/bbh/cot_fewshot/web_of_lies.yaml |generate_until |\n|bbh_cot_fewshot_word_sorting 
|lm_eval/tasks/bbh/cot_fewshot/word_sorting.yaml |generate_until |\n|bbh_cot_zeroshot_boolean_expressions |lm_eval/tasks/bbh/cot_zeroshot/boolean_expressions.yaml |generate_until |\n|bbh_cot_zeroshot_causal_judgement |lm_eval/tasks/bbh/cot_zeroshot/causal_judgement.yaml |generate_until |\n|bbh_cot_zeroshot_date_understanding |lm_eval/tasks/bbh/cot_zeroshot/date_understanding.yaml |generate_until |\n|bbh_cot_zeroshot_disambiguation_qa |lm_eval/tasks/bbh/cot_zeroshot/disambiguation_qa.yaml |generate_until |\n|bbh_cot_zeroshot_dyck_languages |lm_eval/tasks/bbh/cot_zeroshot/dyck_languages.yaml |generate_until |\n|bbh_cot_zeroshot_formal_fallacies |lm_eval/tasks/bbh/cot_zeroshot/formal_fallacies.yaml |generate_until |\n|bbh_cot_zeroshot_geometric_shapes |lm_eval/tasks/bbh/cot_zeroshot/geometric_shapes.yaml |generate_until |\n|bbh_cot_zeroshot_hyperbaton |lm_eval/tasks/bbh/cot_zeroshot/hyperbaton.yaml |generate_until |\n|bbh_cot_zeroshot_logical_deduction_five_objects |lm_eval/tasks/bbh/cot_zeroshot/logical_deduction_five_objects.yaml |generate_until |\n|bbh_cot_zeroshot_logical_deduction_seven_objects |lm_eval/tasks/bbh/cot_zeroshot/logical_deduction_seven_objects.yaml |generate_until |\n|bbh_cot_zeroshot_logical_deduction_three_objects |lm_eval/tasks/bbh/cot_zeroshot/logical_deduction_three_objects.yaml |generate_until |\n|bbh_cot_zeroshot_movie_recommendation |lm_eval/tasks/bbh/cot_zeroshot/movie_recommendation.yaml |generate_until |\n|bbh_cot_zeroshot_multistep_arithmetic_two |lm_eval/tasks/bbh/cot_zeroshot/multistep_arithmetic_two.yaml |generate_until |\n|bbh_cot_zeroshot_navigate |lm_eval/tasks/bbh/cot_zeroshot/navigate.yaml |generate_until |\n|bbh_cot_zeroshot_object_counting |lm_eval/tasks/bbh/cot_zeroshot/object_counting.yaml |generate_until |\n|bbh_cot_zeroshot_penguins_in_a_table |lm_eval/tasks/bbh/cot_zeroshot/penguins_in_a_table.yaml |generate_until |\n|bbh_cot_zeroshot_reasoning_about_colored_objects 
|lm_eval/tasks/bbh/cot_zeroshot/reasoning_about_colored_objects.yaml |generate_until |\n|bbh_cot_zeroshot_ruin_names |lm_eval/tasks/bbh/cot_zeroshot/ruin_names.yaml |generate_until |\n|bbh_cot_zeroshot_salient_translation_error_detection |lm_eval/tasks/bbh/cot_zeroshot/salient_translation_error_detection.yaml |generate_until |\n|bbh_cot_zeroshot_snarks |lm_eval/tasks/bbh/cot_zeroshot/snarks.yaml |generate_until |\n|bbh_cot_zeroshot_sports_understanding |lm_eval/tasks/bbh/cot_zeroshot/sports_understanding.yaml |generate_until |\n|bbh_cot_zeroshot_temporal_sequences |lm_eval/tasks/bbh/cot_zeroshot/temporal_sequences.yaml |generate_until |\n|bbh_cot_zeroshot_tracking_shuffled_objects_five_objects |lm_eval/tasks/bbh/cot_zeroshot/tracking_shuffled_objects_five_objects.yaml |generate_until |\n|bbh_cot_zeroshot_tracking_shuffled_objects_seven_objects |lm_eval/tasks/bbh/cot_zeroshot/tracking_shuffled_objects_seven_objects.yaml |generate_until |\n|bbh_cot_zeroshot_tracking_shuffled_objects_three_objects |lm_eval/tasks/bbh/cot_zeroshot/tracking_shuffled_objects_three_objects.yaml |generate_until |\n|bbh_cot_zeroshot_web_of_lies |lm_eval/tasks/bbh/cot_zeroshot/web_of_lies.yaml |generate_until |\n|bbh_cot_zeroshot_word_sorting |lm_eval/tasks/bbh/cot_zeroshot/word_sorting.yaml |generate_until |\n|bbh_fewshot_boolean_expressions |lm_eval/tasks/bbh/fewshot/boolean_expressions.yaml |generate_until |\n|bbh_fewshot_causal_judgement |lm_eval/tasks/bbh/fewshot/causal_judgement.yaml |generate_until |\n|bbh_fewshot_date_understanding |lm_eval/tasks/bbh/fewshot/date_understanding.yaml |generate_until |\n|bbh_fewshot_disambiguation_qa |lm_eval/tasks/bbh/fewshot/disambiguation_qa.yaml |generate_until |\n|bbh_fewshot_dyck_languages |lm_eval/tasks/bbh/fewshot/dyck_languages.yaml |generate_until |\n|bbh_fewshot_formal_fallacies |lm_eval/tasks/bbh/fewshot/formal_fallacies.yaml |generate_until |\n|bbh_fewshot_geometric_shapes |lm_eval/tasks/bbh/fewshot/geometric_shapes.yaml |generate_until 
|\n|bbh_fewshot_hyperbaton |lm_eval/tasks/bbh/fewshot/hyperbaton.yaml |generate_until |\n|bbh_fewshot_logical_deduction_five_objects |lm_eval/tasks/bbh/fewshot/logical_deduction_five_objects.yaml |generate_until |\n|bbh_fewshot_logical_deduction_seven_objects |lm_eval/tasks/bbh/fewshot/logical_deduction_seven_objects.yaml |generate_until |\n|bbh_fewshot_logical_deduction_three_objects |lm_eval/tasks/bbh/fewshot/logical_deduction_three_objects.yaml |generate_until |\n|bbh_fewshot_movie_recommendation |lm_eval/tasks/bbh/fewshot/movie_recommendation.yaml |generate_until |\n|bbh_fewshot_multistep_arithmetic_two |lm_eval/tasks/bbh/fewshot/multistep_arithmetic_two.yaml |generate_until |\n|bbh_fewshot_navigate |lm_eval/tasks/bbh/fewshot/navigate.yaml |generate_until |\n|bbh_fewshot_object_counting |lm_eval/tasks/bbh/fewshot/object_counting.yaml |generate_until |\n|bbh_fewshot_penguins_in_a_table |lm_eval/tasks/bbh/fewshot/penguins_in_a_table.yaml |generate_until |\n|bbh_fewshot_reasoning_about_colored_objects |lm_eval/tasks/bbh/fewshot/reasoning_about_colored_objects.yaml |generate_until |\n|bbh_fewshot_ruin_names |lm_eval/tasks/bbh/fewshot/ruin_names.yaml |generate_until |\n|bbh_fewshot_salient_translation_error_detection |lm_eval/tasks/bbh/fewshot/salient_translation_error_detection.yaml |generate_until |\n|bbh_fewshot_snarks |lm_eval/tasks/bbh/fewshot/snarks.yaml |generate_until |\n|bbh_fewshot_sports_understanding |lm_eval/tasks/bbh/fewshot/sports_understanding.yaml |generate_until |\n|bbh_fewshot_temporal_sequences |lm_eval/tasks/bbh/fewshot/temporal_sequences.yaml |generate_until |\n|bbh_fewshot_tracking_shuffled_objects_five_objects |lm_eval/tasks/bbh/fewshot/tracking_shuffled_objects_five_objects.yaml |generate_until |\n|bbh_fewshot_tracking_shuffled_objects_seven_objects |lm_eval/tasks/bbh/fewshot/tracking_shuffled_objects_seven_objects.yaml |generate_until |\n|bbh_fewshot_tracking_shuffled_objects_three_objects 
|lm_eval/tasks/bbh/fewshot/tracking_shuffled_objects_three_objects.yaml |generate_until |\n|bbh_fewshot_web_of_lies |lm_eval/tasks/bbh/fewshot/web_of_lies.yaml |generate_until |\n|bbh_fewshot_word_sorting |lm_eval/tasks/bbh/fewshot/word_sorting.yaml |generate_until |\n|bbh_zeroshot_boolean_expressions |lm_eval/tasks/bbh/zeroshot/boolean_expressions.yaml |generate_until |\n|bbh_zeroshot_causal_judgement |lm_eval/tasks/bbh/zeroshot/causal_judgement.yaml |generate_until |\n|bbh_zeroshot_date_understanding |lm_eval/tasks/bbh/zeroshot/date_understanding.yaml |generate_until |\n|bbh_zeroshot_disambiguation_qa |lm_eval/tasks/bbh/zeroshot/disambiguation_qa.yaml |generate_until |\n|bbh_zeroshot_dyck_languages |lm_eval/tasks/bbh/zeroshot/dyck_languages.yaml |generate_until |\n|bbh_zeroshot_formal_fallacies |lm_eval/tasks/bbh/zeroshot/formal_fallacies.yaml |generate_until |\n|bbh_zeroshot_geometric_shapes |lm_eval/tasks/bbh/zeroshot/geometric_shapes.yaml |generate_until |\n|bbh_zeroshot_hyperbaton |lm_eval/tasks/bbh/zeroshot/hyperbaton.yaml |generate_until |\n|bbh_zeroshot_logical_deduction_five_objects |lm_eval/tasks/bbh/zeroshot/logical_deduction_five_objects.yaml |generate_until |\n|bbh_zeroshot_logical_deduction_seven_objects |lm_eval/tasks/bbh/zeroshot/logical_deduction_seven_objects.yaml |generate_until |\n|bbh_zeroshot_logical_deduction_three_objects |lm_eval/tasks/bbh/zeroshot/logical_deduction_three_objects.yaml |generate_until |\n|bbh_zeroshot_movie_recommendation |lm_eval/tasks/bbh/zeroshot/movie_recommendation.yaml |generate_until |\n|bbh_zeroshot_multistep_arithmetic_two |lm_eval/tasks/bbh/zeroshot/multistep_arithmetic_two.yaml |generate_until |\n|bbh_zeroshot_navigate |lm_eval/tasks/bbh/zeroshot/navigate.yaml |generate_until |\n|bbh_zeroshot_object_counting |lm_eval/tasks/bbh/zeroshot/object_counting.yaml |generate_until |\n|bbh_zeroshot_penguins_in_a_table |lm_eval/tasks/bbh/zeroshot/penguins_in_a_table.yaml |generate_until 
|\n|bbh_zeroshot_reasoning_about_colored_objects |lm_eval/tasks/bbh/zeroshot/reasoning_about_colored_objects.yaml |generate_until |\n|bbh_zeroshot_ruin_names |lm_eval/tasks/bbh/zeroshot/ruin_names.yaml |generate_until |\n|bbh_zeroshot_salient_translation_error_detection |lm_eval/tasks/bbh/zeroshot/salient_translation_error_detection.yaml |generate_until |\n|bbh_zeroshot_snarks |lm_eval/tasks/bbh/zeroshot/snarks.yaml |generate_until |\n|bbh_zeroshot_sports_understanding |lm_eval/tasks/bbh/zeroshot/sports_understanding.yaml |generate_until |\n|bbh_zeroshot_temporal_sequences |lm_eval/tasks/bbh/zeroshot/temporal_sequences.yaml |generate_until |\n|bbh_zeroshot_tracking_shuffled_objects_five_objects |lm_eval/tasks/bbh/zeroshot/tracking_shuffled_objects_five_objects.yaml |generate_until |\n|bbh_zeroshot_tracking_shuffled_objects_seven_objects |lm_eval/tasks/bbh/zeroshot/tracking_shuffled_objects_seven_objects.yaml |generate_until |\n|bbh_zeroshot_tracking_shuffled_objects_three_objects |lm_eval/tasks/bbh/zeroshot/tracking_shuffled_objects_three_objects.yaml |generate_until |\n|bbh_zeroshot_web_of_lies |lm_eval/tasks/bbh/zeroshot/web_of_lies.yaml |generate_until |\n|bbh_zeroshot_word_sorting |lm_eval/tasks/bbh/zeroshot/word_sorting.yaml |generate_until |\n|bec2016eu |lm_eval/tasks/basqueglue/bec.yaml |multiple_choice |\n|belebele_acm_Arab |lm_eval/tasks/belebele/belebele_acm_Arab.yaml |multiple_choice |\n|belebele_afr_Latn |lm_eval/tasks/belebele/belebele_afr_Latn.yaml |multiple_choice |\n|belebele_als_Latn |lm_eval/tasks/belebele/belebele_als_Latn.yaml |multiple_choice |\n|belebele_amh_Ethi |lm_eval/tasks/belebele/belebele_amh_Ethi.yaml |multiple_choice |\n|belebele_apc_Arab |lm_eval/tasks/belebele/belebele_apc_Arab.yaml |multiple_choice |\n|belebele_arb_Arab |lm_eval/tasks/belebele/belebele_arb_Arab.yaml |multiple_choice |\n|belebele_arb_Latn |lm_eval/tasks/belebele/belebele_arb_Latn.yaml |multiple_choice |\n|belebele_ars_Arab 
|lm_eval/tasks/belebele/belebele_ars_Arab.yaml |multiple_choice |\n|belebele_ary_Arab |lm_eval/tasks/belebele/belebele_ary_Arab.yaml |multiple_choice |\n|belebele_arz_Arab |lm_eval/tasks/belebele/belebele_arz_Arab.yaml |multiple_choice |\n|belebele_asm_Beng |lm_eval/tasks/belebele/belebele_asm_Beng.yaml |multiple_choice |\n|belebele_azj_Latn |lm_eval/tasks/belebele/belebele_azj_Latn.yaml |multiple_choice |\n|belebele_bam_Latn |lm_eval/tasks/belebele/belebele_bam_Latn.yaml |multiple_choice |\n|belebele_ben_Beng |lm_eval/tasks/belebele/belebele_ben_Beng.yaml |multiple_choice |\n|belebele_ben_Latn |lm_eval/tasks/belebele/belebele_ben_Latn.yaml |multiple_choice |\n|belebele_bod_Tibt |lm_eval/tasks/belebele/belebele_bod_Tibt.yaml |multiple_choice |\n|belebele_bul_Cyrl |lm_eval/tasks/belebele/belebele_bul_Cyrl.yaml |multiple_choice |\n|belebele_cat_Latn |lm_eval/tasks/belebele/belebele_cat_Latn.yaml |multiple_choice |\n|belebele_ceb_Latn |lm_eval/tasks/belebele/belebele_ceb_Latn.yaml |multiple_choice |\n|belebele_ces_Latn |lm_eval/tasks/belebele/belebele_ces_Latn.yaml |multiple_choice |\n|belebele_ckb_Arab |lm_eval/tasks/belebele/belebele_ckb_Arab.yaml |multiple_choice |\n|belebele_dan_Latn |lm_eval/tasks/belebele/belebele_dan_Latn.yaml |multiple_choice |\n|belebele_deu_Latn |lm_eval/tasks/belebele/belebele_deu_Latn.yaml |multiple_choice |\n|belebele_ell_Grek |lm_eval/tasks/belebele/belebele_ell_Grek.yaml |multiple_choice |\n|belebele_eng_Latn |lm_eval/tasks/belebele/belebele_eng_Latn.yaml |multiple_choice |\n|belebele_est_Latn |lm_eval/tasks/belebele/belebele_est_Latn.yaml |multiple_choice |\n|belebele_eus_Latn |lm_eval/tasks/belebele/belebele_eus_Latn.yaml |multiple_choice |\n|belebele_fin_Latn |lm_eval/tasks/belebele/belebele_fin_Latn.yaml |multiple_choice |\n|belebele_fra_Latn |lm_eval/tasks/belebele/belebele_fra_Latn.yaml |multiple_choice |\n|belebele_fuv_Latn |lm_eval/tasks/belebele/belebele_fuv_Latn.yaml |multiple_choice |\n|belebele_gaz_Latn 
|lm_eval/tasks/belebele/belebele_gaz_Latn.yaml |multiple_choice |\n|belebele_grn_Latn |lm_eval/tasks/belebele/belebele_grn_Latn.yaml |multiple_choice |\n|belebele_guj_Gujr |lm_eval/tasks/belebele/belebele_guj_Gujr.yaml |multiple_choice |\n|belebele_hat_Latn |lm_eval/tasks/belebele/belebele_hat_Latn.yaml |multiple_choice |\n|belebele_hau_Latn |lm_eval/tasks/belebele/belebele_hau_Latn.yaml |multiple_choice |\n|belebele_heb_Hebr |lm_eval/tasks/belebele/belebele_heb_Hebr.yaml |multiple_choice |\n|belebele_hin_Deva |lm_eval/tasks/belebele/belebele_hin_Deva.yaml |multiple_choice |\n|belebele_hin_Latn |lm_eval/tasks/belebele/belebele_hin_Latn.yaml |multiple_choice |\n|belebele_hrv_Latn |lm_eval/tasks/belebele/belebele_hrv_Latn.yaml |multiple_choice |\n|belebele_hun_Latn |lm_eval/tasks/belebele/belebele_hun_Latn.yaml |multiple_choice |\n|belebele_hye_Armn |lm_eval/tasks/belebele/belebele_hye_Armn.yaml |multiple_choice |\n|belebele_ibo_Latn |lm_eval/tasks/belebele/belebele_ibo_Latn.yaml |multiple_choice |\n|belebele_ilo_Latn |lm_eval/tasks/belebele/belebele_ilo_Latn.yaml |multiple_choice |\n|belebele_ind_Latn |lm_eval/tasks/belebele/belebele_ind_Latn.yaml |multiple_choice |\n|belebele_isl_Latn |lm_eval/tasks/belebele/belebele_isl_Latn.yaml |multiple_choice |\n|belebele_ita_Latn |lm_eval/tasks/belebele/belebele_ita_Latn.yaml |multiple_choice |\n|belebele_jav_Latn |lm_eval/tasks/belebele/belebele_jav_Latn.yaml |multiple_choice |\n|belebele_jpn_Jpan |lm_eval/tasks/belebele/belebele_jpn_Jpan.yaml |multiple_choice |\n|belebele_kac_Latn |lm_eval/tasks/belebele/belebele_kac_Latn.yaml |multiple_choice |\n|belebele_kan_Knda |lm_eval/tasks/belebele/belebele_kan_Knda.yaml |multiple_choice |\n|belebele_kat_Geor |lm_eval/tasks/belebele/belebele_kat_Geor.yaml |multiple_choice |\n|belebele_kaz_Cyrl |lm_eval/tasks/belebele/belebele_kaz_Cyrl.yaml |multiple_choice |\n|belebele_kea_Latn |lm_eval/tasks/belebele/belebele_kea_Latn.yaml |multiple_choice |\n|belebele_khk_Cyrl 
|lm_eval/tasks/belebele/belebele_khk_Cyrl.yaml |multiple_choice |\n|belebele_khm_Khmr |lm_eval/tasks/belebele/belebele_khm_Khmr.yaml |multiple_choice |\n|belebele_kin_Latn |lm_eval/tasks/belebele/belebele_kin_Latn.yaml |multiple_choice |\n|belebele_kir_Cyrl |lm_eval/tasks/belebele/belebele_kir_Cyrl.yaml |multiple_choice |\n|belebele_kor_Hang |lm_eval/tasks/belebele/belebele_kor_Hang.yaml |multiple_choice |\n|belebele_lao_Laoo |lm_eval/tasks/belebele/belebele_lao_Laoo.yaml |multiple_choice |\n|belebele_lin_Latn |lm_eval/tasks/belebele/belebele_lin_Latn.yaml |multiple_choice |\n|belebele_lit_Latn |lm_eval/tasks/belebele/belebele_lit_Latn.yaml |multiple_choice |\n|belebele_lug_Latn |lm_eval/tasks/belebele/belebele_lug_Latn.yaml |multiple_choice |\n|belebele_luo_Latn |lm_eval/tasks/belebele/belebele_luo_Latn.yaml |multiple_choice |\n|belebele_lvs_Latn |lm_eval/tasks/belebele/belebele_lvs_Latn.yaml |multiple_choice |\n|belebele_mal_Mlym |lm_eval/tasks/belebele/belebele_mal_Mlym.yaml |multiple_choice |\n|belebele_mar_Deva |lm_eval/tasks/belebele/belebele_mar_Deva.yaml |multiple_choice |\n|belebele_mkd_Cyrl |lm_eval/tasks/belebele/belebele_mkd_Cyrl.yaml |multiple_choice |\n|belebele_mlt_Latn |lm_eval/tasks/belebele/belebele_mlt_Latn.yaml |multiple_choice |\n|belebele_mri_Latn |lm_eval/tasks/belebele/belebele_mri_Latn.yaml |multiple_choice |\n|belebele_mya_Mymr |lm_eval/tasks/belebele/belebele_mya_Mymr.yaml |multiple_choice |\n|belebele_nld_Latn |lm_eval/tasks/belebele/belebele_nld_Latn.yaml |multiple_choice |\n|belebele_nob_Latn |lm_eval/tasks/belebele/belebele_nob_Latn.yaml |multiple_choice |\n|belebele_npi_Deva |lm_eval/tasks/belebele/belebele_npi_Deva.yaml |multiple_choice |\n|belebele_npi_Latn |lm_eval/tasks/belebele/belebele_npi_Latn.yaml |multiple_choice |\n|belebele_nso_Latn |lm_eval/tasks/belebele/belebele_nso_Latn.yaml |multiple_choice |\n|belebele_nya_Latn |lm_eval/tasks/belebele/belebele_nya_Latn.yaml |multiple_choice |\n|belebele_ory_Orya 
|lm_eval/tasks/belebele/belebele_ory_Orya.yaml |multiple_choice |\n|belebele_pan_Guru |lm_eval/tasks/belebele/belebele_pan_Guru.yaml |multiple_choice |\n|belebele_pbt_Arab |lm_eval/tasks/belebele/belebele_pbt_Arab.yaml |multiple_choice |\n|belebele_pes_Arab |lm_eval/tasks/belebele/belebele_pes_Arab.yaml |multiple_choice |\n|belebele_plt_Latn |lm_eval/tasks/belebele/belebele_plt_Latn.yaml |multiple_choice |\n|belebele_pol_Latn |lm_eval/tasks/belebele/belebele_pol_Latn.yaml |multiple_choice |\n|belebele_por_Latn |lm_eval/tasks/belebele/belebele_por_Latn.yaml |multiple_choice |\n|belebele_ron_Latn |lm_eval/tasks/belebele/belebele_ron_Latn.yaml |multiple_choice |\n|belebele_rus_Cyrl |lm_eval/tasks/belebele/belebele_rus_Cyrl.yaml |multiple_choice |\n|belebele_shn_Mymr |lm_eval/tasks/belebele/belebele_shn_Mymr.yaml |multiple_choice |\n|belebele_sin_Latn |lm_eval/tasks/belebele/belebele_sin_Latn.yaml |multiple_choice |\n|belebele_sin_Sinh |lm_eval/tasks/belebele/belebele_sin_Sinh.yaml |multiple_choice |\n|belebele_slk_Latn |lm_eval/tasks/belebele/belebele_slk_Latn.yaml |multiple_choice |\n|belebele_slv_Latn |lm_eval/tasks/belebele/belebele_slv_Latn.yaml |multiple_choice |\n|belebele_sna_Latn |lm_eval/tasks/belebele/belebele_sna_Latn.yaml |multiple_choice |\n|belebele_snd_Arab |lm_eval/tasks/belebele/belebele_snd_Arab.yaml |multiple_choice |\n|belebele_som_Latn |lm_eval/tasks/belebele/belebele_som_Latn.yaml |multiple_choice |\n|belebele_sot_Latn |lm_eval/tasks/belebele/belebele_sot_Latn.yaml |multiple_choice |\n|belebele_spa_Latn |lm_eval/tasks/belebele/belebele_spa_Latn.yaml |multiple_choice |\n|belebele_srp_Cyrl |lm_eval/tasks/belebele/belebele_srp_Cyrl.yaml |multiple_choice |\n|belebele_ssw_Latn |lm_eval/tasks/belebele/belebele_ssw_Latn.yaml |multiple_choice |\n|belebele_sun_Latn |lm_eval/tasks/belebele/belebele_sun_Latn.yaml |multiple_choice |\n|belebele_swe_Latn |lm_eval/tasks/belebele/belebele_swe_Latn.yaml |multiple_choice |\n|belebele_swh_Latn 
|lm_eval/tasks/belebele/belebele_swh_Latn.yaml |multiple_choice |\n|belebele_tam_Taml |lm_eval/tasks/belebele/belebele_tam_Taml.yaml |multiple_choice |\n|belebele_tel_Telu |lm_eval/tasks/belebele/belebele_tel_Telu.yaml |multiple_choice |\n|belebele_tgk_Cyrl |lm_eval/tasks/belebele/belebele_tgk_Cyrl.yaml |multiple_choice |\n|belebele_tgl_Latn |lm_eval/tasks/belebele/belebele_tgl_Latn.yaml |multiple_choice |\n|belebele_tha_Thai |lm_eval/tasks/belebele/belebele_tha_Thai.yaml |multiple_choice |\n|belebele_tir_Ethi |lm_eval/tasks/belebele/belebele_tir_Ethi.yaml |multiple_choice |\n|belebele_tsn_Latn |lm_eval/tasks/belebele/belebele_tsn_Latn.yaml |multiple_choice |\n|belebele_tso_Latn |lm_eval/tasks/belebele/belebele_tso_Latn.yaml |multiple_choice |\n|belebele_tur_Latn |lm_eval/tasks/belebele/belebele_tur_Latn.yaml |multiple_choice |\n|belebele_ukr_Cyrl |lm_eval/tasks/belebele/belebele_ukr_Cyrl.yaml |multiple_choice |\n|belebele_urd_Arab |lm_eval/tasks/belebele/belebele_urd_Arab.yaml |multiple_choice |\n|belebele_urd_Latn |lm_eval/tasks/belebele/belebele_urd_Latn.yaml |multiple_choice |\n|belebele_uzn_Latn |lm_eval/tasks/belebele/belebele_uzn_Latn.yaml |multiple_choice |\n|belebele_vie_Latn |lm_eval/tasks/belebele/belebele_vie_Latn.yaml |multiple_choice |\n|belebele_war_Latn |lm_eval/tasks/belebele/belebele_war_Latn.yaml |multiple_choice |\n|belebele_wol_Latn |lm_eval/tasks/belebele/belebele_wol_Latn.yaml |multiple_choice |\n|belebele_xho_Latn |lm_eval/tasks/belebele/belebele_xho_Latn.yaml |multiple_choice |\n|belebele_yor_Latn |lm_eval/tasks/belebele/belebele_yor_Latn.yaml |multiple_choice |\n|belebele_zho_Hans |lm_eval/tasks/belebele/belebele_zho_Hans.yaml |multiple_choice |\n|belebele_zho_Hant |lm_eval/tasks/belebele/belebele_zho_Hant.yaml |multiple_choice |\n|belebele_zsm_Latn |lm_eval/tasks/belebele/belebele_zsm_Latn.yaml |multiple_choice |\n|belebele_zul_Latn |lm_eval/tasks/belebele/belebele_zul_Latn.yaml |multiple_choice |\n|bertaqa_en 
|lm_eval/tasks/bertaqa/bertaqa_en.yaml |multiple_choice |\n|bertaqa_en_mt_gemma-7b |lm_eval/tasks/bertaqa/bertaqa_en_mt_gemma-7b.yaml |multiple_choice |\n|bertaqa_en_mt_hitz |lm_eval/tasks/bertaqa/bertaqa_en_mt_hitz.yaml |multiple_choice |\n|bertaqa_en_mt_itzuli |lm_eval/tasks/bertaqa/bertaqa_en_mt_itzuli.yaml |multiple_choice |\n|bertaqa_en_mt_latxa-13b-v1 |lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-13b-v1.yaml |multiple_choice |\n|bertaqa_en_mt_latxa-13b-v1.1 |lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-13b-v1.1.yaml |multiple_choice |\n|bertaqa_en_mt_latxa-70b-v1 |lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-70b-v1.yaml |multiple_choice |\n|bertaqa_en_mt_latxa-70b-v1.1 |lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-70b-v1.1.yaml |multiple_choice |\n|bertaqa_en_mt_latxa-7b-v1 |lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-7b-v1.yaml |multiple_choice |\n|bertaqa_en_mt_latxa-7b-v1.1 |lm_eval/tasks/bertaqa/bertaqa_en_mt_latxa-7b-v1.1.yaml |multiple_choice |\n|bertaqa_en_mt_llama-2-13b |lm_eval/tasks/bertaqa/bertaqa_en_mt_llama-2-13b.yaml |multiple_choice |\n|bertaqa_en_mt_llama-2-70b |lm_eval/tasks/bertaqa/bertaqa_en_mt_llama-2-70b.yaml |multiple_choice |\n|bertaqa_en_mt_llama-2-7b |lm_eval/tasks/bertaqa/bertaqa_en_mt_llama-2-7b.yaml |multiple_choice |\n|bertaqa_en_mt_madlad |lm_eval/tasks/bertaqa/bertaqa_en_mt_madlad.yaml |multiple_choice |\n|bertaqa_en_mt_nllb |lm_eval/tasks/bertaqa/bertaqa_en_mt_nllb.yaml |multiple_choice |\n|bertaqa_eu |lm_eval/tasks/bertaqa/bertaqa_eu.yaml |multiple_choice |\n|bhtc_v2 |lm_eval/tasks/basqueglue/bhtc.yaml |multiple_choice |\n|bigbench_abstract_narrative_understanding_generate_until |lm_eval/tasks/bigbench/generate_until/abstract_narrative_understanding.yaml |generate_until |\n|bigbench_abstract_narrative_understanding_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/abstract_narrative_understanding.yaml |multiple_choice |\n|bigbench_anachronisms_generate_until |lm_eval/tasks/bigbench/generate_until/anachronisms.yaml |generate_until 
|\n|bigbench_anachronisms_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/anachronisms.yaml |multiple_choice |\n|bigbench_analogical_similarity_generate_until |lm_eval/tasks/bigbench/generate_until/analogical_similarity.yaml |generate_until |\n|bigbench_analogical_similarity_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/analogical_similarity.yaml |multiple_choice |\n|bigbench_analytic_entailment_generate_until |lm_eval/tasks/bigbench/generate_until/analytic_entailment.yaml |generate_until |\n|bigbench_analytic_entailment_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/analytic_entailment.yaml |multiple_choice |\n|bigbench_arithmetic_generate_until |lm_eval/tasks/bigbench/generate_until/arithmetic.yaml |generate_until |\n|bigbench_arithmetic_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/arithmetic.yaml |multiple_choice |\n|bigbench_ascii_word_recognition_generate_until |lm_eval/tasks/bigbench/generate_until/ascii_word_recognition.yaml |generate_until |\n|bigbench_authorship_verification_generate_until |lm_eval/tasks/bigbench/generate_until/authorship_verification.yaml |generate_until |\n|bigbench_authorship_verification_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/authorship_verification.yaml |multiple_choice |\n|bigbench_auto_categorization_generate_until |lm_eval/tasks/bigbench/generate_until/auto_categorization.yaml |generate_until |\n|bigbench_auto_debugging_generate_until |lm_eval/tasks/bigbench/generate_until/auto_debugging.yaml |generate_until |\n|bigbench_bbq_lite_json_generate_until |lm_eval/tasks/bigbench/generate_until/bbq_lite_json.yaml |generate_until |\n|bigbench_bbq_lite_json_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/bbq_lite_json.yaml |multiple_choice |\n|bigbench_bridging_anaphora_resolution_barqa_generate_until |lm_eval/tasks/bigbench/generate_until/bridging_anaphora_resolution_barqa.yaml |generate_until |\n|bigbench_causal_judgment_generate_until 
|lm_eval/tasks/bigbench/generate_until/causal_judgment.yaml |generate_until |\n|bigbench_causal_judgment_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/causal_judgment.yaml |multiple_choice |\n|bigbench_cause_and_effect_generate_until |lm_eval/tasks/bigbench/generate_until/cause_and_effect.yaml |generate_until |\n|bigbench_cause_and_effect_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/cause_and_effect.yaml |multiple_choice |\n|bigbench_checkmate_in_one_generate_until |lm_eval/tasks/bigbench/generate_until/checkmate_in_one.yaml |generate_until |\n|bigbench_checkmate_in_one_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/checkmate_in_one.yaml |multiple_choice |\n|bigbench_chess_state_tracking_generate_until |lm_eval/tasks/bigbench/generate_until/chess_state_tracking.yaml |generate_until |\n|bigbench_chinese_remainder_theorem_generate_until |lm_eval/tasks/bigbench/generate_until/chinese_remainder_theorem.yaml |generate_until |\n|bigbench_cifar10_classification_generate_until |lm_eval/tasks/bigbench/generate_until/cifar10_classification.yaml |generate_until |\n|bigbench_cifar10_classification_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/cifar10_classification.yaml |multiple_choice |\n|bigbench_code_line_description_generate_until |lm_eval/tasks/bigbench/generate_until/code_line_description.yaml |generate_until |\n|bigbench_code_line_description_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/code_line_description.yaml |multiple_choice |\n|bigbench_codenames_generate_until |lm_eval/tasks/bigbench/generate_until/codenames.yaml |generate_until |\n|bigbench_color_generate_until |lm_eval/tasks/bigbench/generate_until/color.yaml |generate_until |\n|bigbench_color_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/color.yaml |multiple_choice |\n|bigbench_common_morpheme_generate_until |lm_eval/tasks/bigbench/generate_until/common_morpheme.yaml |generate_until |\n|bigbench_common_morpheme_multiple_choice 
|lm_eval/tasks/bigbench/multiple_choice/common_morpheme.yaml |multiple_choice |\n|bigbench_conceptual_combinations_generate_until |lm_eval/tasks/bigbench/generate_until/conceptual_combinations.yaml |generate_until |\n|bigbench_conceptual_combinations_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/conceptual_combinations.yaml |multiple_choice |\n|bigbench_conlang_translation_generate_until |lm_eval/tasks/bigbench/generate_until/conlang_translation.yaml |generate_until |\n|bigbench_contextual_parametric_knowledge_conflicts_generate_until |lm_eval/tasks/bigbench/generate_until/contextual_parametric_knowledge_conflicts.yaml |generate_until |\n|bigbench_contextual_parametric_knowledge_conflicts_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/contextual_parametric_knowledge_conflicts.yaml |multiple_choice |\n|bigbench_crash_blossom_generate_until |lm_eval/tasks/bigbench/generate_until/crash_blossom.yaml |generate_until |\n|bigbench_crash_blossom_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/crash_blossom.yaml |multiple_choice |\n|bigbench_crass_ai_generate_until |lm_eval/tasks/bigbench/generate_until/crass_ai.yaml |generate_until |\n|bigbench_crass_ai_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/crass_ai.yaml |multiple_choice |\n|bigbench_cryobiology_spanish_generate_until |lm_eval/tasks/bigbench/generate_until/cryobiology_spanish.yaml |generate_until |\n|bigbench_cryobiology_spanish_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/cryobiology_spanish.yaml |multiple_choice |\n|bigbench_cryptonite_generate_until |lm_eval/tasks/bigbench/generate_until/cryptonite.yaml |generate_until |\n|bigbench_cs_algorithms_generate_until |lm_eval/tasks/bigbench/generate_until/cs_algorithms.yaml |generate_until |\n|bigbench_cs_algorithms_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/cs_algorithms.yaml |multiple_choice |\n|bigbench_dark_humor_detection_generate_until |lm_eval/tasks/bigbench/generate_until/dark_humor_detection.yaml 
|generate_until |\n|bigbench_dark_humor_detection_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/dark_humor_detection.yaml |multiple_choice |\n|bigbench_date_understanding_generate_until |lm_eval/tasks/bigbench/generate_until/date_understanding.yaml |generate_until |\n|bigbench_date_understanding_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/date_understanding.yaml |multiple_choice |\n|bigbench_disambiguation_qa_generate_until |lm_eval/tasks/bigbench/generate_until/disambiguation_qa.yaml |generate_until |\n|bigbench_disambiguation_qa_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/disambiguation_qa.yaml |multiple_choice |\n|bigbench_discourse_marker_prediction_generate_until |lm_eval/tasks/bigbench/generate_until/discourse_marker_prediction.yaml |generate_until |\n|bigbench_discourse_marker_prediction_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/discourse_marker_prediction.yaml |multiple_choice |\n|bigbench_disfl_qa_generate_until |lm_eval/tasks/bigbench/generate_until/disfl_qa.yaml |generate_until |\n|bigbench_dyck_languages_generate_until |lm_eval/tasks/bigbench/generate_until/dyck_languages.yaml |generate_until |\n|bigbench_dyck_languages_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/dyck_languages.yaml |multiple_choice |\n|bigbench_elementary_math_qa_generate_until |lm_eval/tasks/bigbench/generate_until/elementary_math_qa.yaml |generate_until |\n|bigbench_elementary_math_qa_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/elementary_math_qa.yaml |multiple_choice |\n|bigbench_emoji_movie_generate_until |lm_eval/tasks/bigbench/generate_until/emoji_movie.yaml |generate_until |\n|bigbench_emoji_movie_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/emoji_movie.yaml |multiple_choice |\n|bigbench_emojis_emotion_prediction_generate_until |lm_eval/tasks/bigbench/generate_until/emojis_emotion_prediction.yaml |generate_until |\n|bigbench_emojis_emotion_prediction_multiple_choice 
|lm_eval/tasks/bigbench/multiple_choice/emojis_emotion_prediction.yaml |multiple_choice |\n|bigbench_empirical_judgments_generate_until |lm_eval/tasks/bigbench/generate_until/empirical_judgments.yaml |generate_until |\n|bigbench_empirical_judgments_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/empirical_judgments.yaml |multiple_choice |\n|bigbench_english_proverbs_generate_until |lm_eval/tasks/bigbench/generate_until/english_proverbs.yaml |generate_until |\n|bigbench_english_proverbs_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/english_proverbs.yaml |multiple_choice |\n|bigbench_english_russian_proverbs_generate_until |lm_eval/tasks/bigbench/generate_until/english_russian_proverbs.yaml |generate_until |\n|bigbench_english_russian_proverbs_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/english_russian_proverbs.yaml |multiple_choice |\n|bigbench_entailed_polarity_generate_until |lm_eval/tasks/bigbench/generate_until/entailed_polarity.yaml |generate_until |\n|bigbench_entailed_polarity_hindi_generate_until |lm_eval/tasks/bigbench/generate_until/entailed_polarity_hindi.yaml |generate_until |\n|bigbench_entailed_polarity_hindi_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/entailed_polarity_hindi.yaml |multiple_choice |\n|bigbench_entailed_polarity_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/entailed_polarity.yaml |multiple_choice |\n|bigbench_epistemic_reasoning_generate_until |lm_eval/tasks/bigbench/generate_until/epistemic_reasoning.yaml |generate_until |\n|bigbench_epistemic_reasoning_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/epistemic_reasoning.yaml |multiple_choice |\n|bigbench_evaluating_information_essentiality_generate_until |lm_eval/tasks/bigbench/generate_until/evaluating_information_essentiality.yaml |generate_until |\n|bigbench_evaluating_information_essentiality_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/evaluating_information_essentiality.yaml |multiple_choice 
|\n|bigbench_fact_checker_generate_until |lm_eval/tasks/bigbench/generate_until/fact_checker.yaml |generate_until |\n|bigbench_fact_checker_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/fact_checker.yaml |multiple_choice |\n|bigbench_fantasy_reasoning_generate_until |lm_eval/tasks/bigbench/generate_until/fantasy_reasoning.yaml |generate_until |\n|bigbench_fantasy_reasoning_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/fantasy_reasoning.yaml |multiple_choice |\n|bigbench_few_shot_nlg_generate_until |lm_eval/tasks/bigbench/generate_until/few_shot_nlg.yaml |generate_until |\n|bigbench_figure_of_speech_detection_generate_until |lm_eval/tasks/bigbench/generate_until/figure_of_speech_detection.yaml |generate_until |\n|bigbench_figure_of_speech_detection_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/figure_of_speech_detection.yaml |multiple_choice |\n|bigbench_formal_fallacies_syllogisms_negation_generate_until |lm_eval/tasks/bigbench/generate_until/formal_fallacies_syllogisms_negation.yaml |generate_until |\n|bigbench_formal_fallacies_syllogisms_negation_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/formal_fallacies_syllogisms_negation.yaml |multiple_choice |\n|bigbench_gem_generate_until |lm_eval/tasks/bigbench/generate_until/gem.yaml |generate_until |\n|bigbench_gender_inclusive_sentences_german_generate_until |lm_eval/tasks/bigbench/generate_until/gender_inclusive_sentences_german.yaml |generate_until |\n|bigbench_general_knowledge_generate_until |lm_eval/tasks/bigbench/generate_until/general_knowledge.yaml |generate_until |\n|bigbench_general_knowledge_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/general_knowledge.yaml |multiple_choice |\n|bigbench_geometric_shapes_generate_until |lm_eval/tasks/bigbench/generate_until/geometric_shapes.yaml |generate_until |\n|bigbench_geometric_shapes_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/geometric_shapes.yaml |multiple_choice 
|\n|bigbench_goal_step_wikihow_generate_until |lm_eval/tasks/bigbench/generate_until/goal_step_wikihow.yaml |generate_until |\n|bigbench_goal_step_wikihow_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/goal_step_wikihow.yaml |multiple_choice |\n|bigbench_gre_reading_comprehension_generate_until |lm_eval/tasks/bigbench/generate_until/gre_reading_comprehension.yaml |generate_until |\n|bigbench_gre_reading_comprehension_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/gre_reading_comprehension.yaml |multiple_choice |\n|bigbench_hhh_alignment_generate_until |lm_eval/tasks/bigbench/generate_until/hhh_alignment.yaml |generate_until |\n|bigbench_hhh_alignment_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/hhh_alignment.yaml |multiple_choice |\n|bigbench_hindi_question_answering_generate_until |lm_eval/tasks/bigbench/generate_until/hindi_question_answering.yaml |generate_until |\n|bigbench_hindu_knowledge_generate_until |lm_eval/tasks/bigbench/generate_until/hindu_knowledge.yaml |generate_until |\n|bigbench_hindu_knowledge_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/hindu_knowledge.yaml |multiple_choice |\n|bigbench_hinglish_toxicity_generate_until |lm_eval/tasks/bigbench/generate_until/hinglish_toxicity.yaml |generate_until |\n|bigbench_hinglish_toxicity_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/hinglish_toxicity.yaml |multiple_choice |\n|bigbench_human_organs_senses_generate_until |lm_eval/tasks/bigbench/generate_until/human_organs_senses.yaml |generate_until |\n|bigbench_human_organs_senses_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/human_organs_senses.yaml |multiple_choice |\n|bigbench_hyperbaton_generate_until |lm_eval/tasks/bigbench/generate_until/hyperbaton.yaml |generate_until |\n|bigbench_hyperbaton_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/hyperbaton.yaml |multiple_choice |\n|bigbench_identify_math_theorems_generate_until 
|lm_eval/tasks/bigbench/generate_until/identify_math_theorems.yaml |generate_until |\n|bigbench_identify_math_theorems_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/identify_math_theorems.yaml |multiple_choice |\n|bigbench_identify_odd_metaphor_generate_until |lm_eval/tasks/bigbench/generate_until/identify_odd_metaphor.yaml |generate_until |\n|bigbench_identify_odd_metaphor_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/identify_odd_metaphor.yaml |multiple_choice |\n|bigbench_implicatures_generate_until |lm_eval/tasks/bigbench/generate_until/implicatures.yaml |generate_until |\n|bigbench_implicatures_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/implicatures.yaml |multiple_choice |\n|bigbench_implicit_relations_generate_until |lm_eval/tasks/bigbench/generate_until/implicit_relations.yaml |generate_until |\n|bigbench_implicit_relations_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/implicit_relations.yaml |multiple_choice |\n|bigbench_intent_recognition_generate_until |lm_eval/tasks/bigbench/generate_until/intent_recognition.yaml |generate_until |\n|bigbench_intent_recognition_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/intent_recognition.yaml |multiple_choice |\n|bigbench_international_phonetic_alphabet_nli_generate_until |lm_eval/tasks/bigbench/generate_until/international_phonetic_alphabet_nli.yaml |generate_until |\n|bigbench_international_phonetic_alphabet_nli_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/international_phonetic_alphabet_nli.yaml |multiple_choice |\n|bigbench_international_phonetic_alphabet_transliterate_generate_until |lm_eval/tasks/bigbench/generate_until/international_phonetic_alphabet_transliterate.yaml |generate_until |\n|bigbench_intersect_geometry_generate_until |lm_eval/tasks/bigbench/generate_until/intersect_geometry.yaml |generate_until |\n|bigbench_intersect_geometry_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/intersect_geometry.yaml |multiple_choice 
|\n|bigbench_irony_identification_generate_until |lm_eval/tasks/bigbench/generate_until/irony_identification.yaml |generate_until |\n|bigbench_irony_identification_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/irony_identification.yaml |multiple_choice |\n|bigbench_kanji_ascii_generate_until |lm_eval/tasks/bigbench/generate_until/kanji_ascii.yaml |generate_until |\n|bigbench_kanji_ascii_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/kanji_ascii.yaml |multiple_choice |\n|bigbench_kannada_generate_until |lm_eval/tasks/bigbench/generate_until/kannada.yaml |generate_until |\n|bigbench_kannada_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/kannada.yaml |multiple_choice |\n|bigbench_key_value_maps_generate_until |lm_eval/tasks/bigbench/generate_until/key_value_maps.yaml |generate_until |\n|bigbench_key_value_maps_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/key_value_maps.yaml |multiple_choice |\n|bigbench_known_unknowns_generate_until |lm_eval/tasks/bigbench/generate_until/known_unknowns.yaml |generate_until |\n|bigbench_known_unknowns_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/known_unknowns.yaml |multiple_choice |\n|bigbench_language_games_generate_until |lm_eval/tasks/bigbench/generate_until/language_games.yaml |generate_until |\n|bigbench_language_identification_generate_until |lm_eval/tasks/bigbench/generate_until/language_identification.yaml |generate_until |\n|bigbench_language_identification_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/language_identification.yaml |multiple_choice |\n|bigbench_linguistic_mappings_generate_until |lm_eval/tasks/bigbench/generate_until/linguistic_mappings.yaml |generate_until |\n|bigbench_linguistics_puzzles_generate_until |lm_eval/tasks/bigbench/generate_until/linguistics_puzzles.yaml |generate_until |\n|bigbench_list_functions_generate_until |lm_eval/tasks/bigbench/generate_until/list_functions.yaml |generate_until |\n|bigbench_logic_grid_puzzle_generate_until 
|lm_eval/tasks/bigbench/generate_until/logic_grid_puzzle.yaml |generate_until |\n|bigbench_logic_grid_puzzle_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/logic_grid_puzzle.yaml |multiple_choice |\n|bigbench_logical_args_generate_until |lm_eval/tasks/bigbench/generate_until/logical_args.yaml |generate_until |\n|bigbench_logical_args_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/logical_args.yaml |multiple_choice |\n|bigbench_logical_deduction_generate_until |lm_eval/tasks/bigbench/generate_until/logical_deduction.yaml |generate_until |\n|bigbench_logical_deduction_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/logical_deduction.yaml |multiple_choice |\n|bigbench_logical_fallacy_detection_generate_until |lm_eval/tasks/bigbench/generate_until/logical_fallacy_detection.yaml |generate_until |\n|bigbench_logical_fallacy_detection_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/logical_fallacy_detection.yaml |multiple_choice |\n|bigbench_logical_sequence_generate_until |lm_eval/tasks/bigbench/generate_until/logical_sequence.yaml |generate_until |\n|bigbench_logical_sequence_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/logical_sequence.yaml |multiple_choice |\n|bigbench_mathematical_induction_generate_until |lm_eval/tasks/bigbench/generate_until/mathematical_induction.yaml |generate_until |\n|bigbench_mathematical_induction_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/mathematical_induction.yaml |multiple_choice |\n|bigbench_matrixshapes_generate_until |lm_eval/tasks/bigbench/generate_until/matrixshapes.yaml |generate_until |\n|bigbench_metaphor_boolean_generate_until |lm_eval/tasks/bigbench/generate_until/metaphor_boolean.yaml |generate_until |\n|bigbench_metaphor_boolean_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/metaphor_boolean.yaml |multiple_choice |\n|bigbench_metaphor_understanding_generate_until |lm_eval/tasks/bigbench/generate_until/metaphor_understanding.yaml |generate_until 
|\n|bigbench_metaphor_understanding_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/metaphor_understanding.yaml |multiple_choice |\n|bigbench_minute_mysteries_qa_generate_until |lm_eval/tasks/bigbench/generate_until/minute_mysteries_qa.yaml |generate_until |\n|bigbench_misconceptions_generate_until |lm_eval/tasks/bigbench/generate_until/misconceptions.yaml |generate_until |\n|bigbench_misconceptions_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/misconceptions.yaml |multiple_choice |\n|bigbench_misconceptions_russian_generate_until |lm_eval/tasks/bigbench/generate_until/misconceptions_russian.yaml |generate_until |\n|bigbench_misconceptions_russian_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/misconceptions_russian.yaml |multiple_choice |\n|bigbench_mnist_ascii_generate_until |lm_eval/tasks/bigbench/generate_until/mnist_ascii.yaml |generate_until |\n|bigbench_mnist_ascii_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/mnist_ascii.yaml |multiple_choice |\n|bigbench_modified_arithmetic_generate_until |lm_eval/tasks/bigbench/generate_until/modified_arithmetic.yaml |generate_until |\n|bigbench_moral_permissibility_generate_until |lm_eval/tasks/bigbench/generate_until/moral_permissibility.yaml |generate_until |\n|bigbench_moral_permissibility_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/moral_permissibility.yaml |multiple_choice |\n|bigbench_movie_dialog_same_or_different_generate_until |lm_eval/tasks/bigbench/generate_until/movie_dialog_same_or_different.yaml |generate_until |\n|bigbench_movie_dialog_same_or_different_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/movie_dialog_same_or_different.yaml |multiple_choice |\n|bigbench_movie_recommendation_generate_until |lm_eval/tasks/bigbench/generate_until/movie_recommendation.yaml |generate_until |\n|bigbench_movie_recommendation_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/movie_recommendation.yaml |multiple_choice 
|\n|bigbench_mult_data_wrangling_generate_until |lm_eval/tasks/bigbench/generate_until/mult_data_wrangling.yaml |generate_until |\n|bigbench_multiemo_generate_until |lm_eval/tasks/bigbench/generate_until/multiemo.yaml |generate_until |\n|bigbench_multiemo_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/multiemo.yaml |multiple_choice |\n|bigbench_natural_instructions_generate_until |lm_eval/tasks/bigbench/generate_until/natural_instructions.yaml |generate_until |\n|bigbench_navigate_generate_until |lm_eval/tasks/bigbench/generate_until/navigate.yaml |generate_until |\n|bigbench_navigate_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/navigate.yaml |multiple_choice |\n|bigbench_nonsense_words_grammar_generate_until |lm_eval/tasks/bigbench/generate_until/nonsense_words_grammar.yaml |generate_until |\n|bigbench_nonsense_words_grammar_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/nonsense_words_grammar.yaml |multiple_choice |\n|bigbench_novel_concepts_generate_until |lm_eval/tasks/bigbench/generate_until/novel_concepts.yaml |generate_until |\n|bigbench_novel_concepts_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/novel_concepts.yaml |multiple_choice |\n|bigbench_object_counting_generate_until |lm_eval/tasks/bigbench/generate_until/object_counting.yaml |generate_until |\n|bigbench_odd_one_out_generate_until |lm_eval/tasks/bigbench/generate_until/odd_one_out.yaml |generate_until |\n|bigbench_odd_one_out_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/odd_one_out.yaml |multiple_choice |\n|bigbench_operators_generate_until |lm_eval/tasks/bigbench/generate_until/operators.yaml |generate_until |\n|bigbench_paragraph_segmentation_generate_until |lm_eval/tasks/bigbench/generate_until/paragraph_segmentation.yaml |generate_until |\n|bigbench_parsinlu_qa_generate_until |lm_eval/tasks/bigbench/generate_until/parsinlu_qa.yaml |generate_until |\n|bigbench_parsinlu_qa_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/parsinlu_qa.yaml 
|multiple_choice |\n|bigbench_parsinlu_reading_comprehension_generate_until |lm_eval/tasks/bigbench/generate_until/parsinlu_reading_comprehension.yaml |generate_until |\n|bigbench_penguins_in_a_table_generate_until |lm_eval/tasks/bigbench/generate_until/penguins_in_a_table.yaml |generate_until |\n|bigbench_penguins_in_a_table_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/penguins_in_a_table.yaml |multiple_choice |\n|bigbench_periodic_elements_generate_until |lm_eval/tasks/bigbench/generate_until/periodic_elements.yaml |generate_until |\n|bigbench_periodic_elements_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/periodic_elements.yaml |multiple_choice |\n|bigbench_persian_idioms_generate_until |lm_eval/tasks/bigbench/generate_until/persian_idioms.yaml |generate_until |\n|bigbench_persian_idioms_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/persian_idioms.yaml |multiple_choice |\n|bigbench_phrase_relatedness_generate_until |lm_eval/tasks/bigbench/generate_until/phrase_relatedness.yaml |generate_until |\n|bigbench_phrase_relatedness_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/phrase_relatedness.yaml |multiple_choice |\n|bigbench_physical_intuition_generate_until |lm_eval/tasks/bigbench/generate_until/physical_intuition.yaml |generate_until |\n|bigbench_physical_intuition_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/physical_intuition.yaml |multiple_choice |\n|bigbench_physics_generate_until |lm_eval/tasks/bigbench/generate_until/physics.yaml |generate_until |\n|bigbench_physics_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/physics.yaml |multiple_choice |\n|bigbench_physics_questions_generate_until |lm_eval/tasks/bigbench/generate_until/physics_questions.yaml |generate_until |\n|bigbench_play_dialog_same_or_different_generate_until |lm_eval/tasks/bigbench/generate_until/play_dialog_same_or_different.yaml |generate_until |\n|bigbench_play_dialog_same_or_different_multiple_choice 
|lm_eval/tasks/bigbench/multiple_choice/play_dialog_same_or_different.yaml |multiple_choice |\n|bigbench_polish_sequence_labeling_generate_until |lm_eval/tasks/bigbench/generate_until/polish_sequence_labeling.yaml |generate_until |\n|bigbench_presuppositions_as_nli_generate_until |lm_eval/tasks/bigbench/generate_until/presuppositions_as_nli.yaml |generate_until |\n|bigbench_presuppositions_as_nli_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/presuppositions_as_nli.yaml |multiple_choice |\n|bigbench_qa_wikidata_generate_until |lm_eval/tasks/bigbench/generate_until/qa_wikidata.yaml |generate_until |\n|bigbench_question_selection_generate_until |lm_eval/tasks/bigbench/generate_until/question_selection.yaml |generate_until |\n|bigbench_question_selection_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/question_selection.yaml |multiple_choice |\n|bigbench_real_or_fake_text_generate_until |lm_eval/tasks/bigbench/generate_until/real_or_fake_text.yaml |generate_until |\n|bigbench_real_or_fake_text_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/real_or_fake_text.yaml |multiple_choice |\n|bigbench_reasoning_about_colored_objects_generate_until |lm_eval/tasks/bigbench/generate_until/reasoning_about_colored_objects.yaml |generate_until |\n|bigbench_reasoning_about_colored_objects_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/reasoning_about_colored_objects.yaml |multiple_choice |\n|bigbench_repeat_copy_logic_generate_until |lm_eval/tasks/bigbench/generate_until/repeat_copy_logic.yaml |generate_until |\n|bigbench_rephrase_generate_until |lm_eval/tasks/bigbench/generate_until/rephrase.yaml |generate_until |\n|bigbench_riddle_sense_generate_until |lm_eval/tasks/bigbench/generate_until/riddle_sense.yaml |generate_until |\n|bigbench_riddle_sense_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/riddle_sense.yaml |multiple_choice |\n|bigbench_ruin_names_generate_until |lm_eval/tasks/bigbench/generate_until/ruin_names.yaml |generate_until 
|\n|bigbench_ruin_names_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/ruin_names.yaml |multiple_choice |\n|bigbench_salient_translation_error_detection_generate_until |lm_eval/tasks/bigbench/generate_until/salient_translation_error_detection.yaml |generate_until |\n|bigbench_salient_translation_error_detection_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/salient_translation_error_detection.yaml |multiple_choice |\n|bigbench_scientific_press_release_generate_until |lm_eval/tasks/bigbench/generate_until/scientific_press_release.yaml |generate_until |\n|bigbench_semantic_parsing_in_context_sparc_generate_until |lm_eval/tasks/bigbench/generate_until/semantic_parsing_in_context_sparc.yaml |generate_until |\n|bigbench_semantic_parsing_spider_generate_until |lm_eval/tasks/bigbench/generate_until/semantic_parsing_spider.yaml |generate_until |\n|bigbench_sentence_ambiguity_generate_until |lm_eval/tasks/bigbench/generate_until/sentence_ambiguity.yaml |generate_until |\n|bigbench_sentence_ambiguity_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/sentence_ambiguity.yaml |multiple_choice |\n|bigbench_similarities_abstraction_generate_until |lm_eval/tasks/bigbench/generate_until/similarities_abstraction.yaml |generate_until |\n|bigbench_similarities_abstraction_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/similarities_abstraction.yaml |multiple_choice |\n|bigbench_simp_turing_concept_generate_until |lm_eval/tasks/bigbench/generate_until/simp_turing_concept.yaml |generate_until |\n|bigbench_simple_arithmetic_json_generate_until |lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json.yaml |generate_until |\n|bigbench_simple_arithmetic_json_multiple_choice_generate_until |lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json_multiple_choice.yaml |generate_until |\n|bigbench_simple_arithmetic_json_subtasks_generate_until |lm_eval/tasks/bigbench/generate_until/simple_arithmetic_json_subtasks.yaml |generate_until 
|\n|bigbench_simple_arithmetic_multiple_targets_json_generate_until |lm_eval/tasks/bigbench/generate_until/simple_arithmetic_multiple_targets_json.yaml |generate_until |\n|bigbench_simple_ethical_questions_generate_until |lm_eval/tasks/bigbench/generate_until/simple_ethical_questions.yaml |generate_until |\n|bigbench_simple_ethical_questions_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/simple_ethical_questions.yaml |multiple_choice |\n|bigbench_simple_text_editing_generate_until |lm_eval/tasks/bigbench/generate_until/simple_text_editing.yaml |generate_until |\n|bigbench_snarks_generate_until |lm_eval/tasks/bigbench/generate_until/snarks.yaml |generate_until |\n|bigbench_snarks_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/snarks.yaml |multiple_choice |\n|bigbench_social_iqa_generate_until |lm_eval/tasks/bigbench/generate_until/social_iqa.yaml |generate_until |\n|bigbench_social_iqa_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/social_iqa.yaml |multiple_choice |\n|bigbench_social_support_generate_until |lm_eval/tasks/bigbench/generate_until/social_support.yaml |generate_until |\n|bigbench_social_support_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/social_support.yaml |multiple_choice |\n|bigbench_sports_understanding_generate_until |lm_eval/tasks/bigbench/generate_until/sports_understanding.yaml |generate_until |\n|bigbench_sports_understanding_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/sports_understanding.yaml |multiple_choice |\n|bigbench_strange_stories_generate_until |lm_eval/tasks/bigbench/generate_until/strange_stories.yaml |generate_until |\n|bigbench_strange_stories_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/strange_stories.yaml |multiple_choice |\n|bigbench_strategyqa_generate_until |lm_eval/tasks/bigbench/generate_until/strategyqa.yaml |generate_until |\n|bigbench_strategyqa_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/strategyqa.yaml |multiple_choice 
|\n|bigbench_sufficient_information_generate_until |lm_eval/tasks/bigbench/generate_until/sufficient_information.yaml |generate_until |\n|bigbench_suicide_risk_generate_until |lm_eval/tasks/bigbench/generate_until/suicide_risk.yaml |generate_until |\n|bigbench_suicide_risk_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/suicide_risk.yaml |multiple_choice |\n|bigbench_swahili_english_proverbs_generate_until |lm_eval/tasks/bigbench/generate_until/swahili_english_proverbs.yaml |generate_until |\n|bigbench_swahili_english_proverbs_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/swahili_english_proverbs.yaml |multiple_choice |\n|bigbench_swedish_to_german_proverbs_generate_until |lm_eval/tasks/bigbench/generate_until/swedish_to_german_proverbs.yaml |generate_until |\n|bigbench_swedish_to_german_proverbs_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/swedish_to_german_proverbs.yaml |multiple_choice |\n|bigbench_symbol_interpretation_generate_until |lm_eval/tasks/bigbench/generate_until/symbol_interpretation.yaml |generate_until |\n|bigbench_symbol_interpretation_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/symbol_interpretation.yaml |multiple_choice |\n|bigbench_temporal_sequences_generate_until |lm_eval/tasks/bigbench/generate_until/temporal_sequences.yaml |generate_until |\n|bigbench_temporal_sequences_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/temporal_sequences.yaml |multiple_choice |\n|bigbench_tense_generate_until |lm_eval/tasks/bigbench/generate_until/tense.yaml |generate_until |\n|bigbench_timedial_generate_until |lm_eval/tasks/bigbench/generate_until/timedial.yaml |generate_until |\n|bigbench_timedial_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/timedial.yaml |multiple_choice |\n|bigbench_topical_chat_generate_until |lm_eval/tasks/bigbench/generate_until/topical_chat.yaml |generate_until |\n|bigbench_tracking_shuffled_objects_generate_until 
|lm_eval/tasks/bigbench/generate_until/tracking_shuffled_objects.yaml |generate_until |\n|bigbench_tracking_shuffled_objects_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/tracking_shuffled_objects.yaml |multiple_choice |\n|bigbench_understanding_fables_generate_until |lm_eval/tasks/bigbench/generate_until/understanding_fables.yaml |generate_until |\n|bigbench_understanding_fables_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/understanding_fables.yaml |multiple_choice |\n|bigbench_undo_permutation_generate_until |lm_eval/tasks/bigbench/generate_until/undo_permutation.yaml |generate_until |\n|bigbench_undo_permutation_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/undo_permutation.yaml |multiple_choice |\n|bigbench_unit_conversion_generate_until |lm_eval/tasks/bigbench/generate_until/unit_conversion.yaml |generate_until |\n|bigbench_unit_conversion_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/unit_conversion.yaml |multiple_choice |\n|bigbench_unit_interpretation_generate_until |lm_eval/tasks/bigbench/generate_until/unit_interpretation.yaml |generate_until |\n|bigbench_unit_interpretation_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/unit_interpretation.yaml |multiple_choice |\n|bigbench_unnatural_in_context_learning_generate_until |lm_eval/tasks/bigbench/generate_until/unnatural_in_context_learning.yaml |generate_until |\n|bigbench_vitaminc_fact_verification_generate_until |lm_eval/tasks/bigbench/generate_until/vitaminc_fact_verification.yaml |generate_until |\n|bigbench_vitaminc_fact_verification_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/vitaminc_fact_verification.yaml |multiple_choice |\n|bigbench_what_is_the_tao_generate_until |lm_eval/tasks/bigbench/generate_until/what_is_the_tao.yaml |generate_until |\n|bigbench_what_is_the_tao_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/what_is_the_tao.yaml |multiple_choice |\n|bigbench_which_wiki_edit_generate_until 
|lm_eval/tasks/bigbench/generate_until/which_wiki_edit.yaml |generate_until |\n|bigbench_which_wiki_edit_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/which_wiki_edit.yaml |multiple_choice |\n|bigbench_winowhy_generate_until |lm_eval/tasks/bigbench/generate_until/winowhy.yaml |generate_until |\n|bigbench_winowhy_multiple_choice |lm_eval/tasks/bigbench/multiple_choice/winowhy.yaml |multiple_choice |\n|bigbench_word_sorting_generate_until |lm_eval/tasks/bigbench/generate_until/word_sorting.yaml |generate_until |\n|bigbench_word_unscrambling_generate_until |lm_eval/tasks/bigbench/generate_until/word_unscrambling.yaml |generate_until |\n|blimp_adjunct_island |lm_eval/tasks/blimp/adjunct_island.yaml |multiple_choice |\n|blimp_anaphor_gender_agreement |lm_eval/tasks/blimp/anaphor_gender_agreement.yaml |multiple_choice |\n|blimp_anaphor_number_agreement |lm_eval/tasks/blimp/anaphor_number_agreement.yaml |multiple_choice |\n|blimp_animate_subject_passive |lm_eval/tasks/blimp/animate_subject_passive.yaml |multiple_choice |\n|blimp_animate_subject_trans |lm_eval/tasks/blimp/animate_subject_trans.yaml |multiple_choice |\n|blimp_causative |lm_eval/tasks/blimp/causative.yaml |multiple_choice |\n|blimp_complex_NP_island |lm_eval/tasks/blimp/complex_NP_island.yaml |multiple_choice |\n|blimp_coordinate_structure_constraint_complex_left_branch |lm_eval/tasks/blimp/coordinate_structure_constraint_complex_left_branch.yaml |multiple_choice |\n|blimp_coordinate_structure_constraint_object_extraction |lm_eval/tasks/blimp/coordinate_structure_constraint_object_extraction.yaml |multiple_choice |\n|blimp_determiner_noun_agreement_1 |lm_eval/tasks/blimp/determiner_noun_agreement_1.yaml |multiple_choice |\n|blimp_determiner_noun_agreement_2 |lm_eval/tasks/blimp/determiner_noun_agreement_2.yaml |multiple_choice |\n|blimp_determiner_noun_agreement_irregular_1 |lm_eval/tasks/blimp/determiner_noun_agreement_irregular_1.yaml |multiple_choice 
|\n|blimp_determiner_noun_agreement_irregular_2 |lm_eval/tasks/blimp/determiner_noun_agreement_irregular_2.yaml |multiple_choice |\n|blimp_determiner_noun_agreement_with_adj_2 |lm_eval/tasks/blimp/determiner_noun_agreement_with_adj_2.yaml |multiple_choice |\n|blimp_determiner_noun_agreement_with_adj_irregular_1 |lm_eval/tasks/blimp/determiner_noun_agreement_with_adj_irregular_1.yaml |multiple_choice |\n|blimp_determiner_noun_agreement_with_adj_irregular_2 |lm_eval/tasks/blimp/determiner_noun_agreement_with_adj_irregular_2.yaml |multiple_choice |\n|blimp_determiner_noun_agreement_with_adjective_1 |lm_eval/tasks/blimp/determiner_noun_agreement_with_adjective_1.yaml |multiple_choice |\n|blimp_distractor_agreement_relational_noun |lm_eval/tasks/blimp/distractor_agreement_relational_noun.yaml |multiple_choice |\n|blimp_distractor_agreement_relative_clause |lm_eval/tasks/blimp/distractor_agreement_relative_clause.yaml |multiple_choice |\n|blimp_drop_argument |lm_eval/tasks/blimp/drop_argument.yaml |multiple_choice |\n|blimp_ellipsis_n_bar_1 |lm_eval/tasks/blimp/ellipsis_n_bar_1.yaml |multiple_choice |\n|blimp_ellipsis_n_bar_2 |lm_eval/tasks/blimp/ellipsis_n_bar_2.yaml |multiple_choice |\n|blimp_existential_there_object_raising |lm_eval/tasks/blimp/existential_there_object_raising.yaml |multiple_choice |\n|blimp_existential_there_quantifiers_1 |lm_eval/tasks/blimp/existential_there_quantifiers_1.yaml |multiple_choice |\n|blimp_existential_there_quantifiers_2 |lm_eval/tasks/blimp/existential_there_quantifiers_2.yaml |multiple_choice |\n|blimp_existential_there_subject_raising |lm_eval/tasks/blimp/existential_there_subject_raising.yaml |multiple_choice |\n|blimp_expletive_it_object_raising |lm_eval/tasks/blimp/expletive_it_object_raising.yaml |multiple_choice |\n|blimp_inchoative |lm_eval/tasks/blimp/inchoative.yaml |multiple_choice |\n|blimp_intransitive |lm_eval/tasks/blimp/intransitive.yaml |multiple_choice |\n|blimp_irregular_past_participle_adjectives 
|lm_eval/tasks/blimp/irregular_past_participle_adjectives.yaml |multiple_choice |\n|blimp_irregular_past_participle_verbs |lm_eval/tasks/blimp/irregular_past_participle_verbs.yaml |multiple_choice |\n|blimp_irregular_plural_subject_verb_agreement_1 |lm_eval/tasks/blimp/irregular_plural_subject_verb_agreement_1.yaml |multiple_choice |\n|blimp_irregular_plural_subject_verb_agreement_2 |lm_eval/tasks/blimp/irregular_plural_subject_verb_agreement_2.yaml |multiple_choice |\n|blimp_left_branch_island_echo_question |lm_eval/tasks/blimp/left_branch_island_echo_question.yaml |multiple_choice |\n|blimp_left_branch_island_simple_question |lm_eval/tasks/blimp/left_branch_island_simple_question.yaml |multiple_choice |\n|blimp_matrix_question_npi_licensor_present |lm_eval/tasks/blimp/matrix_question_npi_licensor_present.yaml |multiple_choice |\n|blimp_npi_present_1 |lm_eval/tasks/blimp/npi_present_1.yaml |multiple_choice |\n|blimp_npi_present_2 |lm_eval/tasks/blimp/npi_present_2.yaml |multiple_choice |\n|blimp_only_npi_licensor_present |lm_eval/tasks/blimp/only_npi_licensor_present.yaml |multiple_choice |\n|blimp_only_npi_scope |lm_eval/tasks/blimp/only_npi_scope.yaml |multiple_choice |\n|blimp_passive_1 |lm_eval/tasks/blimp/passive_1.yaml |multiple_choice |\n|blimp_passive_2 |lm_eval/tasks/blimp/passive_2.yaml |multiple_choice |\n|blimp_principle_A_c_command |lm_eval/tasks/blimp/principle_A_c_command.yaml |multiple_choice |\n|blimp_principle_A_case_1 |lm_eval/tasks/blimp/principle_A_case_1.yaml |multiple_choice |\n|blimp_principle_A_case_2 |lm_eval/tasks/blimp/principle_A_case_2.yaml |multiple_choice |\n|blimp_principle_A_domain_1 |lm_eval/tasks/blimp/principle_A_domain_1.yaml |multiple_choice |\n|blimp_principle_A_domain_2 |lm_eval/tasks/blimp/principle_A_domain_2.yaml |multiple_choice |\n|blimp_principle_A_domain_3 |lm_eval/tasks/blimp/principle_A_domain_3.yaml |multiple_choice |\n|blimp_principle_A_reconstruction |lm_eval/tasks/blimp/principle_A_reconstruction.yaml 
|multiple_choice |\n|blimp_regular_plural_subject_verb_agreement_1 |lm_eval/tasks/blimp/regular_plural_subject_verb_agreement_1.yaml |multiple_choice |\n|blimp_regular_plural_subject_verb_agreement_2 |lm_eval/tasks/blimp/regular_plural_subject_verb_agreement_2.yaml |multiple_choice |\n|blimp_sentential_negation_npi_licensor_present |lm_eval/tasks/blimp/sentential_negation_npi_licensor_present.yaml |multiple_choice |\n|blimp_sentential_negation_npi_scope |lm_eval/tasks/blimp/sentential_negation_npi_scope.yaml |multiple_choice |\n|blimp_sentential_subject_island |lm_eval/tasks/blimp/sentential_subject_island.yaml |multiple_choice |\n|blimp_superlative_quantifiers_1 |lm_eval/tasks/blimp/superlative_quantifiers_1.yaml |multiple_choice |\n|blimp_superlative_quantifiers_2 |lm_eval/tasks/blimp/superlative_quantifiers_2.yaml |multiple_choice |\n|blimp_tough_vs_raising_1 |lm_eval/tasks/blimp/tough_vs_raising_1.yaml |multiple_choice |\n|blimp_tough_vs_raising_2 |lm_eval/tasks/blimp/tough_vs_raising_2.yaml |multiple_choice |\n|blimp_transitive |lm_eval/tasks/blimp/transitive.yaml |multiple_choice |\n|blimp_wh_island |lm_eval/tasks/blimp/wh_island.yaml |multiple_choice |\n|blimp_wh_questions_object_gap |lm_eval/tasks/blimp/wh_questions_object_gap.yaml |multiple_choice |\n|blimp_wh_questions_subject_gap |lm_eval/tasks/blimp/wh_questions_subject_gap.yaml |multiple_choice |\n|blimp_wh_questions_subject_gap_long_distance |lm_eval/tasks/blimp/wh_questions_subject_gap_long_distance.yaml |multiple_choice |\n|blimp_wh_vs_that_no_gap |lm_eval/tasks/blimp/wh_vs_that_no_gap.yaml |multiple_choice |\n|blimp_wh_vs_that_no_gap_long_distance |lm_eval/tasks/blimp/wh_vs_that_no_gap_long_distance.yaml |multiple_choice |\n|blimp_wh_vs_that_with_gap |lm_eval/tasks/blimp/wh_vs_that_with_gap.yaml |multiple_choice |\n|blimp_wh_vs_that_with_gap_long_distance |lm_eval/tasks/blimp/wh_vs_that_with_gap_long_distance.yaml |multiple_choice |\n|boolq |lm_eval/tasks/super_glue/boolq/default.yaml 
|multiple_choice |\n|boolq-seq2seq |lm_eval/tasks/super_glue/boolq/seq2seq.yaml |generate_until |\n|cb |lm_eval/tasks/super_glue/cb/default.yaml |multiple_choice |\n|ceval-valid_accountant |lm_eval/tasks/ceval/ceval-valid_accountant.yaml |multiple_choice |\n|ceval-valid_advanced_mathematics |lm_eval/tasks/ceval/ceval-valid_advanced_mathematics.yaml |multiple_choice |\n|ceval-valid_art_studies |lm_eval/tasks/ceval/ceval-valid_art_studies.yaml |multiple_choice |\n|ceval-valid_basic_medicine |lm_eval/tasks/ceval/ceval-valid_basic_medicine.yaml |multiple_choice |\n|ceval-valid_business_administration |lm_eval/tasks/ceval/ceval-valid_business_administration.yaml |multiple_choice |\n|ceval-valid_chinese_language_and_literature |lm_eval/tasks/ceval/ceval-valid_chinese_language_and_literature.yaml |multiple_choice |\n|ceval-valid_civil_servant |lm_eval/tasks/ceval/ceval-valid_civil_servant.yaml |multiple_choice |\n|ceval-valid_clinical_medicine |lm_eval/tasks/ceval/ceval-valid_clinical_medicine.yaml |multiple_choice |\n|ceval-valid_college_chemistry |lm_eval/tasks/ceval/ceval-valid_college_chemistry.yaml |multiple_choice |\n|ceval-valid_college_economics |lm_eval/tasks/ceval/ceval-valid_college_economics.yaml |multiple_choice |\n|ceval-valid_college_physics |lm_eval/tasks/ceval/ceval-valid_college_physics.yaml |multiple_choice |\n|ceval-valid_college_programming |lm_eval/tasks/ceval/ceval-valid_college_programming.yaml |multiple_choice |\n|ceval-valid_computer_architecture |lm_eval/tasks/ceval/ceval-valid_computer_architecture.yaml |multiple_choice |\n|ceval-valid_computer_network |lm_eval/tasks/ceval/ceval-valid_computer_network.yaml |multiple_choice |\n|ceval-valid_discrete_mathematics |lm_eval/tasks/ceval/ceval-valid_discrete_mathematics.yaml |multiple_choice |\n|ceval-valid_education_science |lm_eval/tasks/ceval/ceval-valid_education_science.yaml |multiple_choice |\n|ceval-valid_electrical_engineer |lm_eval/tasks/ceval/ceval-valid_electrical_engineer.yaml 
|multiple_choice |\n|ceval-valid_environmental_impact_assessment_engineer |lm_eval/tasks/ceval/ceval-valid_environmental_impact_assessment_engineer.yaml |multiple_choice |\n|ceval-valid_fire_engineer |lm_eval/tasks/ceval/ceval-valid_fire_engineer.yaml |multiple_choice |\n|ceval-valid_high_school_biology |lm_eval/tasks/ceval/ceval-valid_high_school_biology.yaml |multiple_choice |\n|ceval-valid_high_school_chemistry |lm_eval/tasks/ceval/ceval-valid_high_school_chemistry.yaml |multiple_choice |\n|ceval-valid_high_school_chinese |lm_eval/tasks/ceval/ceval-valid_high_school_chinese.yaml |multiple_choice |\n|ceval-valid_high_school_geography |lm_eval/tasks/ceval/ceval-valid_high_school_geography.yaml |multiple_choice |\n|ceval-valid_high_school_history |lm_eval/tasks/ceval/ceval-valid_high_school_history.yaml |multiple_choice |\n|ceval-valid_high_school_mathematics |lm_eval/tasks/ceval/ceval-valid_high_school_mathematics.yaml |multiple_choice |\n|ceval-valid_high_school_physics |lm_eval/tasks/ceval/ceval-valid_high_school_physics.yaml |multiple_choice |\n|ceval-valid_high_school_politics |lm_eval/tasks/ceval/ceval-valid_high_school_politics.yaml |multiple_choice |\n|ceval-valid_ideological_and_moral_cultivation |lm_eval/tasks/ceval/ceval-valid_ideological_and_moral_cultivation.yaml |multiple_choice |\n|ceval-valid_law |lm_eval/tasks/ceval/ceval-valid_law.yaml |multiple_choice |\n|ceval-valid_legal_professional |lm_eval/tasks/ceval/ceval-valid_legal_professional.yaml |multiple_choice |\n|ceval-valid_logic |lm_eval/tasks/ceval/ceval-valid_logic.yaml |multiple_choice |\n|ceval-valid_mao_zedong_thought |lm_eval/tasks/ceval/ceval-valid_mao_zedong_thought.yaml |multiple_choice |\n|ceval-valid_marxism |lm_eval/tasks/ceval/ceval-valid_marxism.yaml |multiple_choice |\n|ceval-valid_metrology_engineer |lm_eval/tasks/ceval/ceval-valid_metrology_engineer.yaml |multiple_choice |\n|ceval-valid_middle_school_biology |lm_eval/tasks/ceval/ceval-valid_middle_school_biology.yaml 
|multiple_choice |\n|ceval-valid_middle_school_chemistry |lm_eval/tasks/ceval/ceval-valid_middle_school_chemistry.yaml |multiple_choice |\n|ceval-valid_middle_school_geography |lm_eval/tasks/ceval/ceval-valid_middle_school_geography.yaml |multiple_choice |\n|ceval-valid_middle_school_history |lm_eval/tasks/ceval/ceval-valid_middle_school_history.yaml |multiple_choice |\n|ceval-valid_middle_school_mathematics |lm_eval/tasks/ceval/ceval-valid_middle_school_mathematics.yaml |multiple_choice |\n|ceval-valid_middle_school_physics |lm_eval/tasks/ceval/ceval-valid_middle_school_physics.yaml |multiple_choice |\n|ceval-valid_middle_school_politics |lm_eval/tasks/ceval/ceval-valid_middle_school_politics.yaml |multiple_choice |\n|ceval-valid_modern_chinese_history |lm_eval/tasks/ceval/ceval-valid_modern_chinese_history.yaml |multiple_choice |\n|ceval-valid_operating_system |lm_eval/tasks/ceval/ceval-valid_operating_system.yaml |multiple_choice |\n|ceval-valid_physician |lm_eval/tasks/ceval/ceval-valid_physician.yaml |multiple_choice |\n|ceval-valid_plant_protection |lm_eval/tasks/ceval/ceval-valid_plant_protection.yaml |multiple_choice |\n|ceval-valid_probability_and_statistics |lm_eval/tasks/ceval/ceval-valid_probability_and_statistics.yaml |multiple_choice |\n|ceval-valid_professional_tour_guide |lm_eval/tasks/ceval/ceval-valid_professional_tour_guide.yaml |multiple_choice |\n|ceval-valid_sports_science |lm_eval/tasks/ceval/ceval-valid_sports_science.yaml |multiple_choice |\n|ceval-valid_tax_accountant |lm_eval/tasks/ceval/ceval-valid_tax_accountant.yaml |multiple_choice |\n|ceval-valid_teacher_qualification |lm_eval/tasks/ceval/ceval-valid_teacher_qualification.yaml |multiple_choice |\n|ceval-valid_urban_and_rural_planner |lm_eval/tasks/ceval/ceval-valid_urban_and_rural_planner.yaml |multiple_choice |\n|ceval-valid_veterinary_medicine |lm_eval/tasks/ceval/ceval-valid_veterinary_medicine.yaml |multiple_choice |\n|cmmlu_agronomy 
|lm_eval/tasks/cmmlu/cmmlu_default_agronomy.yaml |multiple_choice |\n|cmmlu_anatomy |lm_eval/tasks/cmmlu/cmmlu_anatomy.yaml |multiple_choice |\n|cmmlu_ancient_chinese |lm_eval/tasks/cmmlu/cmmlu_default_ancient_chinese.yaml |multiple_choice |\n|cmmlu_arts |lm_eval/tasks/cmmlu/cmmlu_default_arts.yaml |multiple_choice |\n|cmmlu_astronomy |lm_eval/tasks/cmmlu/cmmlu_default_astronomy.yaml |multiple_choice |\n|cmmlu_business_ethics |lm_eval/tasks/cmmlu/cmmlu_default_business_ethics.yaml |multiple_choice |\n|cmmlu_chinese_civil_service_exam |lm_eval/tasks/cmmlu/cmmlu_chinese_civil_service_exam.yaml |multiple_choice |\n|cmmlu_chinese_driving_rule |lm_eval/tasks/cmmlu/cmmlu_default_chinese_driving_rule.yaml |multiple_choice |\n|cmmlu_chinese_food_culture |lm_eval/tasks/cmmlu/cmmlu_default_chinese_food_culture.yaml |multiple_choice |\n|cmmlu_chinese_foreign_policy |lm_eval/tasks/cmmlu/cmmlu_default_chinese_foreign_policy.yaml |multiple_choice |\n|cmmlu_chinese_history |lm_eval/tasks/cmmlu/cmmlu_chinese_history.yaml |multiple_choice |\n|cmmlu_chinese_literature |lm_eval/tasks/cmmlu/cmmlu_default_chinese_literature.yaml |multiple_choice |\n|cmmlu_chinese_teacher_qualification |lm_eval/tasks/cmmlu/cmmlu_default_chinese_teacher_qualification.yaml |multiple_choice |\n|cmmlu_clinical_knowledge |lm_eval/tasks/cmmlu/cmmlu_default_clinical_knowledge.yaml |multiple_choice |\n|cmmlu_college_actuarial_science |lm_eval/tasks/cmmlu/cmmlu_default_college_actuarial_science.yaml |multiple_choice |\n|cmmlu_college_education |lm_eval/tasks/cmmlu/cmmlu_college_education.yaml |multiple_choice |\n|cmmlu_college_engineering_hydrology |lm_eval/tasks/cmmlu/cmmlu_college_engineering_hydrology.yaml |multiple_choice |\n|cmmlu_college_law |lm_eval/tasks/cmmlu/cmmlu_default_college_law.yaml |multiple_choice |\n|cmmlu_college_mathematics |lm_eval/tasks/cmmlu/cmmlu_default_college_mathematics.yaml |multiple_choice |\n|cmmlu_college_medical_statistics 
|lm_eval/tasks/cmmlu/cmmlu_default_college_medical_statistics.yaml |multiple_choice |\n|cmmlu_college_medicine |lm_eval/tasks/cmmlu/cmmlu_default_college_medicine.yaml |multiple_choice |\n|cmmlu_computer_science |lm_eval/tasks/cmmlu/cmmlu_default_computer_science.yaml |multiple_choice |\n|cmmlu_computer_security |lm_eval/tasks/cmmlu/cmmlu_computer_security.yaml |multiple_choice |\n|cmmlu_conceptual_physics |lm_eval/tasks/cmmlu/cmmlu_default_conceptual_physics.yaml |multiple_choice |\n|cmmlu_construction_project_management |lm_eval/tasks/cmmlu/cmmlu_default_construction_project_management.yaml |multiple_choice |\n|cmmlu_economics |lm_eval/tasks/cmmlu/cmmlu_economics.yaml |multiple_choice |\n|cmmlu_education |lm_eval/tasks/cmmlu/cmmlu_default_education.yaml |multiple_choice |\n|cmmlu_electrical_engineering |lm_eval/tasks/cmmlu/cmmlu_default_electrical_engineering.yaml |multiple_choice |\n|cmmlu_elementary_chinese |lm_eval/tasks/cmmlu/cmmlu_default_elementary_chinese.yaml |multiple_choice |\n|cmmlu_elementary_commonsense |lm_eval/tasks/cmmlu/cmmlu_default_elementary_commonsense.yaml |multiple_choice |\n|cmmlu_elementary_information_and_technology |lm_eval/tasks/cmmlu/cmmlu_elementary_information_and_technology.yaml |multiple_choice |\n|cmmlu_elementary_mathematics |lm_eval/tasks/cmmlu/cmmlu_default_elementary_mathematics.yaml |multiple_choice |\n|cmmlu_ethnology |lm_eval/tasks/cmmlu/cmmlu_ethnology.yaml |multiple_choice |\n|cmmlu_food_science |lm_eval/tasks/cmmlu/cmmlu_default_food_science.yaml |multiple_choice |\n|cmmlu_genetics |lm_eval/tasks/cmmlu/cmmlu_default_genetics.yaml |multiple_choice |\n|cmmlu_global_facts |lm_eval/tasks/cmmlu/cmmlu_global_facts.yaml |multiple_choice |\n|cmmlu_high_school_biology |lm_eval/tasks/cmmlu/cmmlu_default_high_school_biology.yaml |multiple_choice |\n|cmmlu_high_school_chemistry |lm_eval/tasks/cmmlu/cmmlu_default_high_school_chemistry.yaml |multiple_choice |\n|cmmlu_high_school_geography 
|lm_eval/tasks/cmmlu/cmmlu_default_high_school_geography.yaml |multiple_choice |\n|cmmlu_high_school_mathematics |lm_eval/tasks/cmmlu/cmmlu_default_high_school_mathematics.yaml |multiple_choice |\n|cmmlu_high_school_physics |lm_eval/tasks/cmmlu/cmmlu_default_high_school_physics.yaml |multiple_choice |\n|cmmlu_high_school_politics |lm_eval/tasks/cmmlu/cmmlu_default_high_school_politics.yaml |multiple_choice |\n|cmmlu_human_sexuality |lm_eval/tasks/cmmlu/cmmlu_default_human_sexuality.yaml |multiple_choice |\n|cmmlu_international_law |lm_eval/tasks/cmmlu/cmmlu_international_law.yaml |multiple_choice |\n|cmmlu_journalism |lm_eval/tasks/cmmlu/cmmlu_default_journalism.yaml |multiple_choice |\n|cmmlu_jurisprudence |lm_eval/tasks/cmmlu/cmmlu_default_jurisprudence.yaml |multiple_choice |\n|cmmlu_legal_and_moral_basis |lm_eval/tasks/cmmlu/cmmlu_legal_and_moral_basis.yaml |multiple_choice |\n|cmmlu_logical |lm_eval/tasks/cmmlu/cmmlu_logical.yaml |multiple_choice |\n|cmmlu_machine_learning |lm_eval/tasks/cmmlu/cmmlu_default_machine_learning.yaml |multiple_choice |\n|cmmlu_management |lm_eval/tasks/cmmlu/cmmlu_management.yaml |multiple_choice |\n|cmmlu_marketing |lm_eval/tasks/cmmlu/cmmlu_marketing.yaml |multiple_choice |\n|cmmlu_marxist_theory |lm_eval/tasks/cmmlu/cmmlu_marxist_theory.yaml |multiple_choice |\n|cmmlu_modern_chinese |lm_eval/tasks/cmmlu/cmmlu_default_modern_chinese.yaml |multiple_choice |\n|cmmlu_nutrition |lm_eval/tasks/cmmlu/cmmlu_default_nutrition.yaml |multiple_choice |\n|cmmlu_philosophy |lm_eval/tasks/cmmlu/cmmlu_default_philosophy.yaml |multiple_choice |\n|cmmlu_professional_accounting |lm_eval/tasks/cmmlu/cmmlu_default_professional_accounting.yaml |multiple_choice |\n|cmmlu_professional_law |lm_eval/tasks/cmmlu/cmmlu_default_professional_law.yaml |multiple_choice |\n|cmmlu_professional_medicine |lm_eval/tasks/cmmlu/cmmlu_default_professional_medicine.yaml |multiple_choice |\n|cmmlu_professional_psychology 
|lm_eval/tasks/cmmlu/cmmlu_professional_psychology.yaml |multiple_choice |\n|cmmlu_public_relations |lm_eval/tasks/cmmlu/cmmlu_default_public_relations.yaml |multiple_choice |\n|cmmlu_security_study |lm_eval/tasks/cmmlu/cmmlu_security_study.yaml |multiple_choice |\n|cmmlu_sociology |lm_eval/tasks/cmmlu/cmmlu_default_sociology.yaml |multiple_choice |\n|cmmlu_sports_science |lm_eval/tasks/cmmlu/cmmlu_sports_science.yaml |multiple_choice |\n|cmmlu_traditional_chinese_medicine |lm_eval/tasks/cmmlu/cmmlu_traditional_chinese_medicine.yaml |multiple_choice |\n|cmmlu_virology |lm_eval/tasks/cmmlu/cmmlu_default_virology.yaml |multiple_choice |\n|cmmlu_world_history |lm_eval/tasks/cmmlu/cmmlu_world_history.yaml |multiple_choice |\n|cmmlu_world_religions |lm_eval/tasks/cmmlu/cmmlu_default_world_religions.yaml |multiple_choice |\n|code2text_go |lm_eval/tasks/code_x_glue/code-text/go.yaml |generate_until |\n|code2text_java |lm_eval/tasks/code_x_glue/code-text/java.yaml |generate_until |\n|code2text_javascript |lm_eval/tasks/code_x_glue/code-text/javascript.yaml |generate_until |\n|code2text_php |lm_eval/tasks/code_x_glue/code-text/php.yaml |generate_until |\n|code2text_python |lm_eval/tasks/code_x_glue/code-text/python.yaml |generate_until |\n|code2text_ruby |lm_eval/tasks/code_x_glue/code-text/ruby.yaml |generate_until |\n|cola |lm_eval/tasks/glue/cola/default.yaml |multiple_choice |\n|commonsense_qa |lm_eval/tasks/commonsense_qa/default.yaml |multiple_choice |\n|copa |lm_eval/tasks/super_glue/copa/default.yaml |multiple_choice |\n|copa_ar |lm_eval/tasks/alghafa/copa_ar/copa_ar.yaml |multiple_choice |\n|copal_id_colloquial |lm_eval/tasks/copal_id/colloquial.yaml |multiple_choice |\n|copal_id_standard |lm_eval/tasks/copal_id/standard.yaml |multiple_choice |\n|coqa |lm_eval/tasks/coqa/default.yaml |generate_until |\n|crows_pairs_english |lm_eval/tasks/crows_pairs/crows_pairs_english.yaml |multiple_choice |\n|crows_pairs_english_age 
|lm_eval/tasks/crows_pairs/crows_pairs_english_age.yaml |multiple_choice |\n|crows_pairs_english_autre |lm_eval/tasks/crows_pairs/crows_pairs_english_autre.yaml |multiple_choice |\n|crows_pairs_english_disability |lm_eval/tasks/crows_pairs/crows_pairs_english_disability.yaml |multiple_choice |\n|crows_pairs_english_gender |lm_eval/tasks/crows_pairs/crows_pairs_english_gender.yaml |multiple_choice |\n|crows_pairs_english_nationality |lm_eval/tasks/crows_pairs/crows_pairs_english_nationality.yaml |multiple_choice |\n|crows_pairs_english_physical_appearance |lm_eval/tasks/crows_pairs/crows_pairs_english_physical_appearance.yaml |multiple_choice |\n|crows_pairs_english_race_color |lm_eval/tasks/crows_pairs/crows_pairs_english_race_color.yaml |multiple_choice |\n|crows_pairs_english_religion |lm_eval/tasks/crows_pairs/crows_pairs_english_religion.yaml |multiple_choice |\n|crows_pairs_english_sexual_orientation |lm_eval/tasks/crows_pairs/crows_pairs_english_sexual_orientation.yaml |multiple_choice |\n|crows_pairs_english_socioeconomic |lm_eval/tasks/crows_pairs/crows_pairs_english_socioeconomic.yaml |multiple_choice |\n|crows_pairs_french |lm_eval/tasks/crows_pairs/crows_pairs_french.yaml |multiple_choice |\n|crows_pairs_french_age |lm_eval/tasks/crows_pairs/crows_pairs_french_age.yaml |multiple_choice |\n|crows_pairs_french_autre |lm_eval/tasks/crows_pairs/crows_pairs_french_autre.yaml |multiple_choice |\n|crows_pairs_french_disability |lm_eval/tasks/crows_pairs/crows_pairs_french_disability.yaml |multiple_choice |\n|crows_pairs_french_gender |lm_eval/tasks/crows_pairs/crows_pairs_french_gender.yaml |multiple_choice |\n|crows_pairs_french_nationality |lm_eval/tasks/crows_pairs/crows_pairs_french_nationality.yaml |multiple_choice |\n|crows_pairs_french_physical_appearance |lm_eval/tasks/crows_pairs/crows_pairs_french_physical_appearance.yaml |multiple_choice |\n|crows_pairs_french_race_color |lm_eval/tasks/crows_pairs/crows_pairs_french_race_color.yaml |multiple_choice 
|\n|crows_pairs_french_religion |lm_eval/tasks/crows_pairs/crows_pairs_french_religion.yaml |multiple_choice |\n|crows_pairs_french_sexual_orientation |lm_eval/tasks/crows_pairs/crows_pairs_french_sexual_orientation.yaml |multiple_choice |\n|crows_pairs_french_socioeconomic |lm_eval/tasks/crows_pairs/crows_pairs_french_socioeconomic.yaml |multiple_choice |\n|csatqa_gr |lm_eval/tasks/csatqa/csatqa_gr.yaml |multiple_choice |\n|csatqa_li |lm_eval/tasks/csatqa/csatqa_li.yaml |multiple_choice |\n|csatqa_rch |lm_eval/tasks/csatqa/csatqa_rch.yaml |multiple_choice |\n|csatqa_rcs |lm_eval/tasks/csatqa/csatqa_rcs.yaml |multiple_choice |\n|csatqa_rcss |lm_eval/tasks/csatqa/csatqa_rcss.yaml |multiple_choice |\n|csatqa_wr |lm_eval/tasks/csatqa/csatqa_wr.yaml |multiple_choice |\n|cycle_letters |lm_eval/tasks/unscramble/cycle_letters.yaml |generate_until |\n|drop |lm_eval/tasks/drop/default.yaml |generate_until |\n|epec_koref_bin |lm_eval/tasks/basqueglue/coref.yaml |multiple_choice |\n|eq_bench |lm_eval/tasks/eq_bench/default.yaml |generate_until |\n|ethics_cm |lm_eval/tasks/hendrycks_ethics/commonsense.yaml |multiple_choice |\n|ethics_deontology |lm_eval/tasks/hendrycks_ethics/deontology.yaml |multiple_choice |\n|ethics_justice |lm_eval/tasks/hendrycks_ethics/justice.yaml |multiple_choice |\n|ethics_utilitarianism |lm_eval/tasks/hendrycks_ethics/utilitarianism.yaml |multiple_choice |\n|ethics_virtue |lm_eval/tasks/hendrycks_ethics/virtue.yaml |multiple_choice |\n|eus_exams_es_ejadministrativo |lm_eval/tasks/eus_exams/eus_exams_es_ejadministrativo.yaml |multiple_choice |\n|eus_exams_es_ejauxiliar |lm_eval/tasks/eus_exams/eus_exams_es_ejauxiliar.yaml |multiple_choice |\n|eus_exams_es_ejsubalterno |lm_eval/tasks/eus_exams/eus_exams_es_ejsubalterno.yaml |multiple_choice |\n|eus_exams_es_ejtecnico |lm_eval/tasks/eus_exams/eus_exams_es_ejtecnico.yaml |multiple_choice |\n|eus_exams_es_opeayuntamientovitoria |lm_eval/tasks/eus_exams/eus_exams_es_opeayuntamientovitoria.yaml 
|multiple_choice |\n|eus_exams_es_opebilbao |lm_eval/tasks/eus_exams/eus_exams_es_opebilbao.yaml |multiple_choice |\n|eus_exams_es_opeehuadmin |lm_eval/tasks/eus_exams/eus_exams_es_opeehuadmin.yaml |multiple_choice |\n|eus_exams_es_opeehuaux |lm_eval/tasks/eus_exams/eus_exams_es_opeehuaux.yaml |multiple_choice |\n|eus_exams_es_opeehubiblio |lm_eval/tasks/eus_exams/eus_exams_es_opeehubiblio.yaml |multiple_choice |\n|eus_exams_es_opeehuderecho |lm_eval/tasks/eus_exams/eus_exams_es_opeehuderecho.yaml |multiple_choice |\n|eus_exams_es_opeehueconomicas |lm_eval/tasks/eus_exams/eus_exams_es_opeehueconomicas.yaml |multiple_choice |\n|eus_exams_es_opeehuempresariales |lm_eval/tasks/eus_exams/eus_exams_es_opeehuempresariales.yaml |multiple_choice |\n|eus_exams_es_opeehusubalterno |lm_eval/tasks/eus_exams/eus_exams_es_opeehusubalterno.yaml |multiple_choice |\n|eus_exams_es_opeehutecnico |lm_eval/tasks/eus_exams/eus_exams_es_opeehutecnico.yaml |multiple_choice |\n|eus_exams_es_opeehutecnicob |lm_eval/tasks/eus_exams/eus_exams_es_opeehutecnicob.yaml |multiple_choice |\n|eus_exams_es_opeosakiadmin |lm_eval/tasks/eus_exams/eus_exams_es_opeosakiadmin.yaml |multiple_choice |\n|eus_exams_es_opeosakiaux |lm_eval/tasks/eus_exams/eus_exams_es_opeosakiaux.yaml |multiple_choice |\n|eus_exams_es_opeosakiauxenf |lm_eval/tasks/eus_exams/eus_exams_es_opeosakiauxenf.yaml |multiple_choice |\n|eus_exams_es_opeosakicelador |lm_eval/tasks/eus_exams/eus_exams_es_opeosakicelador.yaml |multiple_choice |\n|eus_exams_es_opeosakienf |lm_eval/tasks/eus_exams/eus_exams_es_opeosakienf.yaml |multiple_choice |\n|eus_exams_es_opeosakijuridico |lm_eval/tasks/eus_exams/eus_exams_es_opeosakijuridico.yaml |multiple_choice |\n|eus_exams_es_opeosakioperario |lm_eval/tasks/eus_exams/eus_exams_es_opeosakioperario.yaml |multiple_choice |\n|eus_exams_es_opeosakitecnico |lm_eval/tasks/eus_exams/eus_exams_es_opeosakitecnico.yaml |multiple_choice |\n|eus_exams_es_opeosakivarios 
|lm_eval/tasks/eus_exams/eus_exams_es_opeosakivarios.yaml |multiple_choice |\n|eus_exams_es_osakidetza1c |lm_eval/tasks/eus_exams/eus_exams_es_osakidetza1c.yaml |multiple_choice |\n|eus_exams_es_osakidetza2c |lm_eval/tasks/eus_exams/eus_exams_es_osakidetza2c.yaml |multiple_choice |\n|eus_exams_es_osakidetza3c |lm_eval/tasks/eus_exams/eus_exams_es_osakidetza3c.yaml |multiple_choice |\n|eus_exams_es_osakidetza4c |lm_eval/tasks/eus_exams/eus_exams_es_osakidetza4c.yaml |multiple_choice |\n|eus_exams_es_osakidetza5c |lm_eval/tasks/eus_exams/eus_exams_es_osakidetza5c.yaml |multiple_choice |\n|eus_exams_es_osakidetza6c |lm_eval/tasks/eus_exams/eus_exams_es_osakidetza6c.yaml |multiple_choice |\n|eus_exams_es_osakidetza7c |lm_eval/tasks/eus_exams/eus_exams_es_osakidetza7c.yaml |multiple_choice |\n|eus_exams_es_osakidetza8c |lm_eval/tasks/eus_exams/eus_exams_es_osakidetza8c.yaml |multiple_choice |\n|eus_exams_es_osakidetza9c |lm_eval/tasks/eus_exams/eus_exams_es_osakidetza9c.yaml |multiple_choice |\n|eus_exams_eu_ejadministrari |lm_eval/tasks/eus_exams/eus_exams_eu_ejadministrari.yaml |multiple_choice |\n|eus_exams_eu_ejlaguntza |lm_eval/tasks/eus_exams/eus_exams_eu_ejlaguntza.yaml |multiple_choice |\n|eus_exams_eu_ejlaguntzaile |lm_eval/tasks/eus_exams/eus_exams_eu_ejlaguntzaile.yaml |multiple_choice |\n|eus_exams_eu_ejteknikari |lm_eval/tasks/eus_exams/eus_exams_eu_ejteknikari.yaml |multiple_choice |\n|eus_exams_eu_opebilbaoeu |lm_eval/tasks/eus_exams/eus_exams_eu_opebilbaoeu.yaml |multiple_choice |\n|eus_exams_eu_opeehuadmineu |lm_eval/tasks/eus_exams/eus_exams_eu_opeehuadmineu.yaml |multiple_choice |\n|eus_exams_eu_opeehuauxeu |lm_eval/tasks/eus_exams/eus_exams_eu_opeehuauxeu.yaml |multiple_choice |\n|eus_exams_eu_opeehubiblioeu |lm_eval/tasks/eus_exams/eus_exams_eu_opeehubiblioeu.yaml |multiple_choice |\n|eus_exams_eu_opeehuderechoeu |lm_eval/tasks/eus_exams/eus_exams_eu_opeehuderechoeu.yaml |multiple_choice |\n|eus_exams_eu_opeehueconomicaseu 
|lm_eval/tasks/eus_exams/eus_exams_eu_opeehueconomicaseu.yaml |multiple_choice |\n|eus_exams_eu_opeehuempresarialeseu |lm_eval/tasks/eus_exams/eus_exams_eu_opeehuempresarialeseu.yaml |multiple_choice |\n|eus_exams_eu_opeehusubalternoeu |lm_eval/tasks/eus_exams/eus_exams_eu_opeehusubalternoeu.yaml |multiple_choice |\n|eus_exams_eu_opeehutecnicoeu |lm_eval/tasks/eus_exams/eus_exams_eu_opeehutecnicoeu.yaml |multiple_choice |\n|eus_exams_eu_opeehuteknikarib |lm_eval/tasks/eus_exams/eus_exams_eu_opeehuteknikarib.yaml |multiple_choice |\n|eus_exams_eu_opegasteizkoudala |lm_eval/tasks/eus_exams/eus_exams_eu_opegasteizkoudala.yaml |multiple_choice |\n|eus_exams_eu_opeosakiadmineu |lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiadmineu.yaml |multiple_choice |\n|eus_exams_eu_opeosakiauxenfeu |lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiauxenfeu.yaml |multiple_choice |\n|eus_exams_eu_opeosakiauxeu |lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiauxeu.yaml |multiple_choice |\n|eus_exams_eu_opeosakiceladoreu |lm_eval/tasks/eus_exams/eus_exams_eu_opeosakiceladoreu.yaml |multiple_choice |\n|eus_exams_eu_opeosakienfeu |lm_eval/tasks/eus_exams/eus_exams_eu_opeosakienfeu.yaml |multiple_choice |\n|eus_exams_eu_opeosakioperarioeu |lm_eval/tasks/eus_exams/eus_exams_eu_opeosakioperarioeu.yaml |multiple_choice |\n|eus_exams_eu_opeosakitecnicoeu |lm_eval/tasks/eus_exams/eus_exams_eu_opeosakitecnicoeu.yaml |multiple_choice |\n|eus_exams_eu_opeosakivarioseu |lm_eval/tasks/eus_exams/eus_exams_eu_opeosakivarioseu.yaml |multiple_choice |\n|eus_exams_eu_osakidetza1e |lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza1e.yaml |multiple_choice |\n|eus_exams_eu_osakidetza2e |lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza2e.yaml |multiple_choice |\n|eus_exams_eu_osakidetza3e |lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza3e.yaml |multiple_choice |\n|eus_exams_eu_osakidetza5e |lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza5e.yaml |multiple_choice |\n|eus_exams_eu_osakidetza6e 
|lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza6e.yaml |multiple_choice |\n|eus_exams_eu_osakidetza7e |lm_eval/tasks/eus_exams/eus_exams_eu_osakidetza7e.yaml |multiple_choice |\n|eus_proficiency |lm_eval/tasks/eus_proficiency/eus_proficiency.yaml |multiple_choice |\n|eus_reading |lm_eval/tasks/eus_reading/eus_reading.yaml |multiple_choice |\n|eus_trivia |lm_eval/tasks/eus_trivia/eus_trivia.yaml |multiple_choice |\n|fld_default |lm_eval/tasks/fld/fld_default.yaml | |\n|fld_logical_formula_default |lm_eval/tasks/fld/fld_logical_formula_default.yaml | |\n|fld_logical_formula_star |lm_eval/tasks/fld/fld_logical_formula_star.yaml | |\n|fld_star |lm_eval/tasks/fld/fld_star.yaml | |\n|french_bench_arc_challenge |lm_eval/tasks/french_bench/french_bench_arc_challenge.yaml |multiple_choice |\n|french_bench_boolqa |lm_eval/tasks/french_bench/french_bench_boolqa.yaml |multiple_choice |\n|french_bench_fquadv2 |lm_eval/tasks/french_bench/french_bench_fquadv2.yaml |generate_until |\n|french_bench_fquadv2_bool |lm_eval/tasks/french_bench/french_bench_fquadv2_bool.yaml |multiple_choice |\n|french_bench_fquadv2_genq |lm_eval/tasks/french_bench/french_bench_fquadv2_genq.yaml |generate_until |\n|french_bench_fquadv2_hasAns |lm_eval/tasks/french_bench/french_bench_fquadv2_hasAns.yaml |generate_until |\n|french_bench_grammar |lm_eval/tasks/french_bench/french_bench_grammar.yaml |multiple_choice |\n|french_bench_hellaswag |lm_eval/tasks/french_bench/french_bench_hellaswag.yaml |multiple_choice |\n|french_bench_multifquad |lm_eval/tasks/french_bench/french_bench_multifquad.yaml |generate_until |\n|french_bench_opus_perplexity |lm_eval/tasks/french_bench/french_bench_opus_perplexity.yaml |loglikelihood_rolling|\n|french_bench_orangesum_abstract |lm_eval/tasks/french_bench/french_bench_orangesum_abstract.yaml |generate_until |\n|french_bench_orangesum_title |lm_eval/tasks/french_bench/french_bench_orangesum_title.yaml |generate_until |\n|french_bench_reading_comp 
|lm_eval/tasks/french_bench/french_bench_reading_comp.yaml |multiple_choice |\n|french_bench_topic_based_nli |lm_eval/tasks/french_bench/french_bench_topic_based_nli.yaml |multiple_choice |\n|french_bench_trivia |lm_eval/tasks/french_bench/french_bench_trivia.yaml |generate_until |\n|french_bench_vocab |lm_eval/tasks/french_bench/french_bench_vocab.yaml |multiple_choice |\n|french_bench_wikitext_fr |lm_eval/tasks/french_bench/french_bench_wikitext_fr.yaml |loglikelihood_rolling|\n|french_bench_xnli |lm_eval/tasks/french_bench/french_bench_xnli.yaml |multiple_choice |\n|glianorex |lm_eval/tasks/glianorex/glianorex.yaml |multiple_choice |\n|glianorex_en |lm_eval/tasks/glianorex/glianorex_en.yaml |multiple_choice |\n|glianorex_fr |lm_eval/tasks/glianorex/glianorex_fr.yaml |multiple_choice |\n|gpqa_diamond_cot_n_shot |lm_eval/tasks/gpqa/cot_n_shot/gpqa_diamond_cot_n_shot.yaml |generate_until |\n|gpqa_diamond_cot_zeroshot |lm_eval/tasks/gpqa/cot_zeroshot/gpqa_diamond_cot_zeroshot.yaml |generate_until |\n|gpqa_diamond_generative_n_shot |lm_eval/tasks/gpqa/generative/gpqa_diamond_generative_n_shot.yaml |generate_until |\n|gpqa_diamond_n_shot |lm_eval/tasks/gpqa/n_shot/gpqa_diamond_n_shot.yaml |multiple_choice |\n|gpqa_diamond_zeroshot |lm_eval/tasks/gpqa/zeroshot/gpqa_diamond_zeroshot.yaml |multiple_choice |\n|gpqa_extended_cot_n_shot |lm_eval/tasks/gpqa/cot_n_shot/gpqa_extended_cot_n_shot.yaml |generate_until |\n|gpqa_extended_cot_zeroshot |lm_eval/tasks/gpqa/cot_zeroshot/gpqa_extended_cot_zeroshot.yaml |generate_until |\n|gpqa_extended_generative_n_shot |lm_eval/tasks/gpqa/generative/gpqa_extended_generative_n_shot.yaml |generate_until |\n|gpqa_extended_n_shot |lm_eval/tasks/gpqa/n_shot/gpqa_extended_n_shot.yaml |multiple_choice |\n|gpqa_extended_zeroshot |lm_eval/tasks/gpqa/zeroshot/gpqa_extended_zeroshot.yaml |multiple_choice |\n|gpqa_main_cot_n_shot |lm_eval/tasks/gpqa/cot_n_shot/gpqa_main_cot_n_shot.yaml |generate_until |\n|gpqa_main_cot_zeroshot 
|lm_eval/tasks/gpqa/cot_zeroshot/gpqa_main_cot_zeroshot.yaml |generate_until |\n|gpqa_main_generative_n_shot |lm_eval/tasks/gpqa/generative/gpqa_main_generative_n_shot.yaml |generate_until |\n|gpqa_main_n_shot |lm_eval/tasks/gpqa/n_shot/gpqa_main_n_shot.yaml |multiple_choice |\n|gpqa_main_zeroshot |lm_eval/tasks/gpqa/zeroshot/gpqa_main_zeroshot.yaml |multiple_choice |\n|gsm8k |lm_eval/tasks/gsm8k/gsm8k.yaml |generate_until |\n|gsm8k_cot |lm_eval/tasks/gsm8k/gsm8k-cot.yaml |generate_until |\n|gsm8k_cot_llama |lm_eval/tasks/gsm8k/gsm8k-cot-llama.yaml |generate_until |\n|gsm8k_cot_self_consistency |lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml |generate_until |\n|gsm8k_cot_zeroshot |lm_eval/tasks/gsm8k/gsm8k-cot-zeroshot.yaml |generate_until |\n|gsm_plus |lm_eval/tasks/gsm_plus/gsm_plus.yaml |generate_until |\n|gsm_plus_mini |lm_eval/tasks/gsm_plus/gsm_plus_mini.yaml |generate_until |\n|haerae_general_knowledge |lm_eval/tasks/haerae/haerae_gk.yaml |multiple_choice |\n|haerae_history |lm_eval/tasks/haerae/haerae_hi.yaml |multiple_choice |\n|haerae_loan_word |lm_eval/tasks/haerae/haerae_lw.yaml |multiple_choice |\n|haerae_rare_word |lm_eval/tasks/haerae/haerae_rw.yaml |multiple_choice |\n|haerae_standard_nomenclature |lm_eval/tasks/haerae/haerae_sn.yaml |multiple_choice |\n|headqa_en |lm_eval/tasks/headqa/headqa_en.yaml |multiple_choice |\n|headqa_es |lm_eval/tasks/headqa/headqa_es.yaml |multiple_choice |\n|hellaswag |lm_eval/tasks/hellaswag/hellaswag.yaml |multiple_choice |\n|hellaswag_ar |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_ar.yaml |multiple_choice |\n|hellaswag_bn |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_bn.yaml |multiple_choice |\n|hellaswag_ca |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_ca.yaml |multiple_choice |\n|hellaswag_da |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_da.yaml |multiple_choice |\n|hellaswag_de |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_de.yaml |multiple_choice |\n|hellaswag_es 
|lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_es.yaml |multiple_choice |\n|hellaswag_eu |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_eu.yaml |multiple_choice |\n|hellaswag_fr |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_fr.yaml |multiple_choice |\n|hellaswag_gu |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_gu.yaml |multiple_choice |\n|hellaswag_hi |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_hi.yaml |multiple_choice |\n|hellaswag_hr |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_hr.yaml |multiple_choice |\n|hellaswag_hu |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_hu.yaml |multiple_choice |\n|hellaswag_hy |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_hy.yaml |multiple_choice |\n|hellaswag_id |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_id.yaml |multiple_choice |\n|hellaswag_it |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_it.yaml |multiple_choice |\n|hellaswag_kn |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_kn.yaml |multiple_choice |\n|hellaswag_ml |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_ml.yaml |multiple_choice |\n|hellaswag_mr |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_mr.yaml |multiple_choice |\n|hellaswag_ne |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_ne.yaml |multiple_choice |\n|hellaswag_nl |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_nl.yaml |multiple_choice |\n|hellaswag_pt |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_pt.yaml |multiple_choice |\n|hellaswag_ro |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_ro.yaml |multiple_choice |\n|hellaswag_ru |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_ru.yaml |multiple_choice |\n|hellaswag_sk |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_sk.yaml |multiple_choice |\n|hellaswag_sr |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_sr.yaml |multiple_choice |\n|hellaswag_sv |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_sv.yaml |multiple_choice 
|\n|hellaswag_ta |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_ta.yaml |multiple_choice |\n|hellaswag_te |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_te.yaml |multiple_choice |\n|hellaswag_uk |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_uk.yaml |multiple_choice |\n|hellaswag_vi |lm_eval/tasks/okapi/hellaswag_multilingual/hellaswag_vi.yaml |multiple_choice |\n|hendrycks_math_algebra |lm_eval/tasks/hendrycks_math/hendrycks_math_algebra.yaml |generate_until |\n|hendrycks_math_counting_and_prob |lm_eval/tasks/hendrycks_math/hendrycks_math_counting_and_prob.yaml |generate_until |\n|hendrycks_math_geometry |lm_eval/tasks/hendrycks_math/hendrycks_math_geometry.yaml |generate_until |\n|hendrycks_math_intermediate_algebra |lm_eval/tasks/hendrycks_math/hendrycks_math_intermediate_algebra.yaml |generate_until |\n|hendrycks_math_num_theory |lm_eval/tasks/hendrycks_math/hendrycks_math_num_theory.yaml |generate_until |\n|hendrycks_math_prealgebra |lm_eval/tasks/hendrycks_math/hendrycks_math_prealgebra.yaml |generate_until |\n|hendrycks_math_precalc |lm_eval/tasks/hendrycks_math/hendrycks_math_precalc.yaml |generate_until |\n|ifeval |lm_eval/tasks/ifeval/ifeval.yaml |generate_until |\n|inverse_scaling_hindsight_neglect_10shot |lm_eval/tasks/inverse_scaling/inverse_scaling_hindsight_neglect.yaml |multiple_choice |\n|inverse_scaling_into_the_unknown |lm_eval/tasks/inverse_scaling/inverse_scaling_into_the_unknown.yaml |multiple_choice |\n|inverse_scaling_memo_trap |lm_eval/tasks/inverse_scaling/inverse_scaling_memo_trap.yaml |multiple_choice |\n|inverse_scaling_modus_tollens |lm_eval/tasks/inverse_scaling/inverse_scaling_modus_tollens.yaml |multiple_choice |\n|inverse_scaling_neqa |lm_eval/tasks/inverse_scaling/inverse_scaling_neqa.yaml |multiple_choice |\n|inverse_scaling_pattern_matching_suppression |lm_eval/tasks/inverse_scaling/inverse_scaling_pattern_matching_suppression.yaml |multiple_choice |\n|inverse_scaling_quote_repetition 
|lm_eval/tasks/inverse_scaling/inverse_scaling_quote_repetition.yaml |multiple_choice |\n|inverse_scaling_redefine_math |lm_eval/tasks/inverse_scaling/inverse_scaling_redefine_math.yaml |multiple_choice |\n|inverse_scaling_repetitive_algebra |lm_eval/tasks/inverse_scaling/inverse_scaling_repetitive_algebra.yaml |multiple_choice |\n|inverse_scaling_sig_figs |lm_eval/tasks/inverse_scaling/inverse_scaling_sig_figs.yaml |multiple_choice |\n|inverse_scaling_winobias_antistereotype |lm_eval/tasks/inverse_scaling/inverse_scaling_winobias_antistereotype.yaml |multiple_choice |\n|iwslt2017-ar-en |lm_eval/tasks/translation/iwslt2017_ar-en.yaml |generate_until |\n|iwslt2017-en-ar |lm_eval/tasks/translation/iwslt2017_en-ar.yaml |generate_until |\n|kmmlu_direct_accounting |lm_eval/tasks/kmmlu/direct/kmmlu_direct_accounting.yaml |generate_until |\n|kmmlu_direct_agricultural_sciences |lm_eval/tasks/kmmlu/direct/kmmlu_direct_agricultural_sciences.yaml |generate_until |\n|kmmlu_direct_aviation_engineering_and_maintenance |lm_eval/tasks/kmmlu/direct/kmmlu_direct_aviation_engineering_and_maintenance.yaml |generate_until |\n|kmmlu_direct_biology |lm_eval/tasks/kmmlu/direct/kmmlu_direct_biology.yaml |generate_until |\n|kmmlu_direct_chemical_engineering |lm_eval/tasks/kmmlu/direct/kmmlu_direct_chemical_engineering.yaml |generate_until |\n|kmmlu_direct_chemistry |lm_eval/tasks/kmmlu/direct/kmmlu_direct_chemistry.yaml |generate_until |\n|kmmlu_direct_civil_engineering |lm_eval/tasks/kmmlu/direct/kmmlu_direct_civil_engineering.yaml |generate_until |\n|kmmlu_direct_computer_science |lm_eval/tasks/kmmlu/direct/kmmlu_direct_computer_science.yaml |generate_until |\n|kmmlu_direct_construction |lm_eval/tasks/kmmlu/direct/kmmlu_direct_construction.yaml |generate_until |\n|kmmlu_direct_criminal_law |lm_eval/tasks/kmmlu/direct/kmmlu_direct_criminal_law.yaml |generate_until |\n|kmmlu_direct_ecology |lm_eval/tasks/kmmlu/direct/kmmlu_direct_ecology.yaml |generate_until |\n|kmmlu_direct_economics 
|lm_eval/tasks/kmmlu/direct/kmmlu_direct_economics.yaml |generate_until |\n|kmmlu_direct_education |lm_eval/tasks/kmmlu/direct/kmmlu_direct_education.yaml |generate_until |\n|kmmlu_direct_electrical_engineering |lm_eval/tasks/kmmlu/direct/kmmlu_direct_electrical_engineering.yaml |generate_until |\n|kmmlu_direct_electronics_engineering |lm_eval/tasks/kmmlu/direct/kmmlu_direct_electronics_engineering.yaml |generate_until |\n|kmmlu_direct_energy_management |lm_eval/tasks/kmmlu/direct/kmmlu_direct_energy_management.yaml |generate_until |\n|kmmlu_direct_environmental_science |lm_eval/tasks/kmmlu/direct/kmmlu_direct_environmental_science.yaml |generate_until |\n|kmmlu_direct_fashion |lm_eval/tasks/kmmlu/direct/kmmlu_direct_fashion.yaml |generate_until |\n|kmmlu_direct_food_processing |lm_eval/tasks/kmmlu/direct/kmmlu_direct_food_processing.yaml |generate_until |\n|kmmlu_direct_gas_technology_and_engineering |lm_eval/tasks/kmmlu/direct/kmmlu_direct_gas_technology_and_engineering.yaml |generate_until |\n|kmmlu_direct_geomatics |lm_eval/tasks/kmmlu/direct/kmmlu_direct_geomatics.yaml |generate_until |\n|kmmlu_direct_health |lm_eval/tasks/kmmlu/direct/kmmlu_direct_health.yaml |generate_until |\n|kmmlu_direct_industrial_engineer |lm_eval/tasks/kmmlu/direct/kmmlu_direct_industrial_engineer.yaml |generate_until |\n|kmmlu_direct_information_technology |lm_eval/tasks/kmmlu/direct/kmmlu_direct_information_technology.yaml |generate_until |\n|kmmlu_direct_interior_architecture_and_design |lm_eval/tasks/kmmlu/direct/kmmlu_direct_interior_architecture_and_design.yaml |generate_until |\n|kmmlu_direct_korean_history |lm_eval/tasks/kmmlu/direct/kmmlu_direct_korean_history.yaml |generate_until |\n|kmmlu_direct_law |lm_eval/tasks/kmmlu/direct/kmmlu_direct_law.yaml |generate_until |\n|kmmlu_direct_machine_design_and_manufacturing |lm_eval/tasks/kmmlu/direct/kmmlu_direct_machine_design_and_manufacturing.yaml |generate_until |\n|kmmlu_direct_management 
|lm_eval/tasks/kmmlu/direct/kmmlu_direct_management.yaml |generate_until |\n|kmmlu_direct_maritime_engineering |lm_eval/tasks/kmmlu/direct/kmmlu_direct_maritime_engineering.yaml |generate_until |\n|kmmlu_direct_marketing |lm_eval/tasks/kmmlu/direct/kmmlu_direct_marketing.yaml |generate_until |\n|kmmlu_direct_materials_engineering |lm_eval/tasks/kmmlu/direct/kmmlu_direct_materials_engineering.yaml |generate_until |\n|kmmlu_direct_math |lm_eval/tasks/kmmlu/direct/kmmlu_direct_math.yaml |generate_until |\n|kmmlu_direct_mechanical_engineering |lm_eval/tasks/kmmlu/direct/kmmlu_direct_mechanical_engineering.yaml |generate_until |\n|kmmlu_direct_nondestructive_testing |lm_eval/tasks/kmmlu/direct/kmmlu_direct_nondestructive_testing.yaml |generate_until |\n|kmmlu_direct_patent |lm_eval/tasks/kmmlu/direct/kmmlu_direct_patent.yaml |generate_until |\n|kmmlu_direct_political_science_and_sociology |lm_eval/tasks/kmmlu/direct/kmmlu_direct_political_science_and_sociology.yaml |generate_until |\n|kmmlu_direct_psychology |lm_eval/tasks/kmmlu/direct/kmmlu_direct_psychology.yaml |generate_until |\n|kmmlu_direct_public_safety |lm_eval/tasks/kmmlu/direct/kmmlu_direct_public_safety.yaml |generate_until |\n|kmmlu_direct_railway_and_automotive_engineering |lm_eval/tasks/kmmlu/direct/kmmlu_direct_railway_and_automotive_engineering.yaml |generate_until |\n|kmmlu_direct_real_estate |lm_eval/tasks/kmmlu/direct/kmmlu_direct_real_estate.yaml |generate_until |\n|kmmlu_direct_refrigerating_machinery |lm_eval/tasks/kmmlu/direct/kmmlu_direct_refrigerating_machinery.yaml |generate_until |\n|kmmlu_direct_social_welfare |lm_eval/tasks/kmmlu/direct/kmmlu_direct_social_welfare.yaml |generate_until |\n|kmmlu_direct_taxation |lm_eval/tasks/kmmlu/direct/kmmlu_direct_taxation.yaml |generate_until |\n|kmmlu_direct_telecommunications_and_wireless_technology |lm_eval/tasks/kmmlu/direct/kmmlu_direct_telecommunications_and_wireless_technology.yaml |generate_until |\n|kmmlu_hard_accounting 
|lm_eval/tasks/kmmlu/hard/kmmlu_hard_accounting.yaml |multiple_choice |\n|kmmlu_hard_agricultural_sciences |lm_eval/tasks/kmmlu/hard/kmmlu_hard_agricultural_sciences.yaml |multiple_choice |\n|kmmlu_hard_aviation_engineering_and_maintenance |lm_eval/tasks/kmmlu/hard/kmmlu_hard_aviation_engineering_and_maintenance.yaml |multiple_choice |\n|kmmlu_hard_biology |lm_eval/tasks/kmmlu/hard/kmmlu_hard_biology.yaml |multiple_choice |\n|kmmlu_hard_chemical_engineering |lm_eval/tasks/kmmlu/hard/kmmlu_hard_chemical_engineering.yaml |multiple_choice |\n|kmmlu_hard_chemistry |lm_eval/tasks/kmmlu/hard/kmmlu_hard_chemistry.yaml |multiple_choice |\n|kmmlu_hard_civil_engineering |lm_eval/tasks/kmmlu/hard/kmmlu_hard_civil_engineering.yaml |multiple_choice |\n|kmmlu_hard_computer_science |lm_eval/tasks/kmmlu/hard/kmmlu_hard_computer_science.yaml |multiple_choice |\n|kmmlu_hard_construction |lm_eval/tasks/kmmlu/hard/kmmlu_hard_construction.yaml |multiple_choice |\n|kmmlu_hard_cot_accounting |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_accounting.yaml |generate_until |\n|kmmlu_hard_cot_agricultural_sciences |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_agricultural_sciences.yaml |generate_until |\n|kmmlu_hard_cot_aviation_engineering_and_maintenance |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_aviation_engineering_and_maintenance.yaml |generate_until |\n|kmmlu_hard_cot_biology |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_biology.yaml |generate_until |\n|kmmlu_hard_cot_chemical_engineering |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_chemical_engineering.yaml |generate_until |\n|kmmlu_hard_cot_chemistry |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_chemistry.yaml |generate_until |\n|kmmlu_hard_cot_civil_engineering |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_civil_engineering.yaml |generate_until |\n|kmmlu_hard_cot_computer_science |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_computer_science.yaml |generate_until |\n|kmmlu_hard_cot_construction 
|lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_construction.yaml |generate_until |\n|kmmlu_hard_cot_criminal_law |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_criminal_law.yaml |generate_until |\n|kmmlu_hard_cot_ecology |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_ecology.yaml |generate_until |\n|kmmlu_hard_cot_economics |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_economics.yaml |generate_until |\n|kmmlu_hard_cot_education |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_education.yaml |generate_until |\n|kmmlu_hard_cot_electrical_engineering |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_electrical_engineering.yaml |generate_until |\n|kmmlu_hard_cot_electronics_engineering |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_electronics_engineering.yaml |generate_until |\n|kmmlu_hard_cot_energy_management |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_energy_management.yaml |generate_until |\n|kmmlu_hard_cot_environmental_science |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_environmental_science.yaml |generate_until |\n|kmmlu_hard_cot_fashion |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_fashion.yaml |generate_until |\n|kmmlu_hard_cot_food_processing |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_food_processing.yaml |generate_until |\n|kmmlu_hard_cot_gas_technology_and_engineering |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_gas_technology_and_engineering.yaml |generate_until |\n|kmmlu_hard_cot_geomatics |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_geomatics.yaml |generate_until |\n|kmmlu_hard_cot_health |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_health.yaml |generate_until |\n|kmmlu_hard_cot_industrial_engineer |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_industrial_engineer.yaml |generate_until |\n|kmmlu_hard_cot_information_technology |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_information_technology.yaml |generate_until |\n|kmmlu_hard_cot_interior_architecture_and_design |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_interior_architecture_and_design.yaml |generate_until 
|\n|kmmlu_hard_cot_korean_history |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_korean_history.yaml |generate_until |\n|kmmlu_hard_cot_law |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_law.yaml |generate_until |\n|kmmlu_hard_cot_machine_design_and_manufacturing |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_machine_design_and_manufacturing.yaml |generate_until |\n|kmmlu_hard_cot_management |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_management.yaml |generate_until |\n|kmmlu_hard_cot_maritime_engineering |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_maritime_engineering.yaml |generate_until |\n|kmmlu_hard_cot_marketing |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_marketing.yaml |generate_until |\n|kmmlu_hard_cot_materials_engineering |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_materials_engineering.yaml |generate_until |\n|kmmlu_hard_cot_math |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_math.yaml |generate_until |\n|kmmlu_hard_cot_mechanical_engineering |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_mechanical_engineering.yaml |generate_until |\n|kmmlu_hard_cot_nondestructive_testing |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_nondestructive_testing.yaml |generate_until |\n|kmmlu_hard_cot_patent |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_patent.yaml |generate_until |\n|kmmlu_hard_cot_political_science_and_sociology |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_political_science_and_sociology.yaml |generate_until |\n|kmmlu_hard_cot_psychology |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_psychology.yaml |generate_until |\n|kmmlu_hard_cot_public_safety |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_public_safety.yaml |generate_until |\n|kmmlu_hard_cot_railway_and_automotive_engineering |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_railway_and_automotive_engineering.yaml |generate_until |\n|kmmlu_hard_cot_real_estate |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_real_estate.yaml |generate_until |\n|kmmlu_hard_cot_refrigerating_machinery 
|lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_refrigerating_machinery.yaml |generate_until |\n|kmmlu_hard_cot_social_welfare |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_social_welfare.yaml |generate_until |\n|kmmlu_hard_cot_taxation |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_taxation.yaml |generate_until |\n|kmmlu_hard_cot_telecommunications_and_wireless_technology |lm_eval/tasks/kmmlu/cot_hard/kmmlu_cot_hard_telecommunications_and_wireless_technology.yaml |generate_until |\n|kmmlu_hard_criminal_law |lm_eval/tasks/kmmlu/hard/kmmlu_hard_criminal_law.yaml |multiple_choice |\n|kmmlu_hard_direct_accounting |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_accounting.yaml |generate_until |\n|kmmlu_hard_direct_agricultural_sciences |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_agricultural_sciences.yaml |generate_until |\n|kmmlu_hard_direct_aviation_engineering_and_maintenance |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_aviation_engineering_and_maintenance.yaml |generate_until |\n|kmmlu_hard_direct_biology |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_biology.yaml |generate_until |\n|kmmlu_hard_direct_chemical_engineering |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_chemical_engineering.yaml |generate_until |\n|kmmlu_hard_direct_chemistry |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_chemistry.yaml |generate_until |\n|kmmlu_hard_direct_civil_engineering |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_civil_engineering.yaml |generate_until |\n|kmmlu_hard_direct_computer_science |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_computer_science.yaml |generate_until |\n|kmmlu_hard_direct_construction |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_construction.yaml |generate_until |\n|kmmlu_hard_direct_criminal_law |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_criminal_law.yaml |generate_until |\n|kmmlu_hard_direct_ecology |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_ecology.yaml |generate_until 
|\n|kmmlu_hard_direct_economics |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_economics.yaml |generate_until |\n|kmmlu_hard_direct_education |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_education.yaml |generate_until |\n|kmmlu_hard_direct_electrical_engineering |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_electrical_engineering.yaml |generate_until |\n|kmmlu_hard_direct_electronics_engineering |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_electronics_engineering.yaml |generate_until |\n|kmmlu_hard_direct_energy_management |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_energy_management.yaml |generate_until |\n|kmmlu_hard_direct_environmental_science |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_environmental_science.yaml |generate_until |\n|kmmlu_hard_direct_fashion |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_fashion.yaml |generate_until |\n|kmmlu_hard_direct_food_processing |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_food_processing.yaml |generate_until |\n|kmmlu_hard_direct_gas_technology_and_engineering |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_gas_technology_and_engineering.yaml |generate_until |\n|kmmlu_hard_direct_geomatics |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_geomatics.yaml |generate_until |\n|kmmlu_hard_direct_health |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_health.yaml |generate_until |\n|kmmlu_hard_direct_industrial_engineer |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_industrial_engineer.yaml |generate_until |\n|kmmlu_hard_direct_information_technology |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_information_technology.yaml |generate_until |\n|kmmlu_hard_direct_interior_architecture_and_design |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_interior_architecture_and_design.yaml |generate_until |\n|kmmlu_hard_direct_korean_history |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_korean_history.yaml |generate_until |\n|kmmlu_hard_direct_law 
|lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_law.yaml |generate_until |\n|kmmlu_hard_direct_machine_design_and_manufacturing |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_machine_design_and_manufacturing.yaml |generate_until |\n|kmmlu_hard_direct_management |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_management.yaml |generate_until |\n|kmmlu_hard_direct_maritime_engineering |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_maritime_engineering.yaml |generate_until |\n|kmmlu_hard_direct_marketing |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_marketing.yaml |generate_until |\n|kmmlu_hard_direct_materials_engineering |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_materials_engineering.yaml |generate_until |\n|kmmlu_hard_direct_math |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_math.yaml |generate_until |\n|kmmlu_hard_direct_mechanical_engineering |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_mechanical_engineering.yaml |generate_until |\n|kmmlu_hard_direct_nondestructive_testing |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_nondestructive_testing.yaml |generate_until |\n|kmmlu_hard_direct_patent |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_patent.yaml |generate_until |\n|kmmlu_hard_direct_political_science_and_sociology |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_political_science_and_sociology.yaml |generate_until |\n|kmmlu_hard_direct_psychology |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_psychology.yaml |generate_until |\n|kmmlu_hard_direct_public_safety |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_public_safety.yaml |generate_until |\n|kmmlu_hard_direct_railway_and_automotive_engineering |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_railway_and_automotive_engineering.yaml |generate_until |\n|kmmlu_hard_direct_real_estate |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_real_estate.yaml |generate_until |\n|kmmlu_hard_direct_refrigerating_machinery 
|lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_refrigerating_machinery.yaml |generate_until |\n|kmmlu_hard_direct_social_welfare |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_social_welfare.yaml |generate_until |\n|kmmlu_hard_direct_taxation |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_taxation.yaml |generate_until |\n|kmmlu_hard_direct_telecommunications_and_wireless_technology |lm_eval/tasks/kmmlu/direct_hard/kmmlu_direct_hard_telecommunications_and_wireless_technology.yaml |generate_until |\n|kmmlu_hard_ecology |lm_eval/tasks/kmmlu/hard/kmmlu_hard_ecology.yaml |multiple_choice |\n|kmmlu_hard_economics |lm_eval/tasks/kmmlu/hard/kmmlu_hard_economics.yaml |multiple_choice |\n|kmmlu_hard_education |lm_eval/tasks/kmmlu/hard/kmmlu_hard_education.yaml |multiple_choice |\n|kmmlu_hard_electrical_engineering |lm_eval/tasks/kmmlu/hard/kmmlu_hard_electrical_engineering.yaml |multiple_choice |\n|kmmlu_hard_electronics_engineering |lm_eval/tasks/kmmlu/hard/kmmlu_hard_electronics_engineering.yaml |multiple_choice |\n|kmmlu_hard_energy_management |lm_eval/tasks/kmmlu/hard/kmmlu_hard_energy_management.yaml |multiple_choice |\n|kmmlu_hard_environmental_science |lm_eval/tasks/kmmlu/hard/kmmlu_hard_environmental_science.yaml |multiple_choice |\n|kmmlu_hard_fashion |lm_eval/tasks/kmmlu/hard/kmmlu_hard_fashion.yaml |multiple_choice |\n|kmmlu_hard_food_processing |lm_eval/tasks/kmmlu/hard/kmmlu_hard_food_processing.yaml |multiple_choice |\n|kmmlu_hard_gas_technology_and_engineering |lm_eval/tasks/kmmlu/hard/kmmlu_hard_gas_technology_and_engineering.yaml |multiple_choice |\n|kmmlu_hard_geomatics |lm_eval/tasks/kmmlu/hard/kmmlu_hard_geomatics.yaml |multiple_choice |\n|kmmlu_hard_health |lm_eval/tasks/kmmlu/hard/kmmlu_hard_health.yaml |multiple_choice |\n|kmmlu_hard_industrial_engineer |lm_eval/tasks/kmmlu/hard/kmmlu_hard_industrial_engineer.yaml |multiple_choice |\n|kmmlu_hard_information_technology |lm_eval/tasks/kmmlu/hard/kmmlu_hard_information_technology.yaml 
|multiple_choice |\n|kmmlu_hard_interior_architecture_and_design |lm_eval/tasks/kmmlu/hard/kmmlu_hard_interior_architecture_and_design.yaml |multiple_choice |\n|kmmlu_hard_korean_history |lm_eval/tasks/kmmlu/hard/kmmlu_hard_korean_history.yaml |multiple_choice |\n|kmmlu_hard_law |lm_eval/tasks/kmmlu/hard/kmmlu_hard_law.yaml |multiple_choice |\n|kmmlu_hard_machine_design_and_manufacturing |lm_eval/tasks/kmmlu/hard/kmmlu_hard_machine_design_and_manufacturing.yaml |multiple_choice |\n|kmmlu_hard_management |lm_eval/tasks/kmmlu/hard/kmmlu_hard_management.yaml |multiple_choice |\n|kmmlu_hard_maritime_engineering |lm_eval/tasks/kmmlu/hard/kmmlu_hard_maritime_engineering.yaml |multiple_choice |\n|kmmlu_hard_marketing |lm_eval/tasks/kmmlu/hard/kmmlu_hard_marketing.yaml |multiple_choice |\n|kmmlu_hard_materials_engineering |lm_eval/tasks/kmmlu/hard/kmmlu_hard_materials_engineering.yaml |multiple_choice |\n|kmmlu_hard_math |lm_eval/tasks/kmmlu/hard/kmmlu_hard_math.yaml |multiple_choice |\n|kmmlu_hard_mechanical_engineering |lm_eval/tasks/kmmlu/hard/kmmlu_hard_mechanical_engineering.yaml |multiple_choice |\n|kmmlu_hard_nondestructive_testing |lm_eval/tasks/kmmlu/hard/kmmlu_hard_nondestructive_testing.yaml |multiple_choice |\n|kmmlu_hard_patent |lm_eval/tasks/kmmlu/hard/kmmlu_hard_patent.yaml |multiple_choice |\n|kmmlu_hard_political_science_and_sociology |lm_eval/tasks/kmmlu/hard/kmmlu_hard_political_science_and_sociology.yaml |multiple_choice |\n|kmmlu_hard_psychology |lm_eval/tasks/kmmlu/hard/kmmlu_hard_psychology.yaml |multiple_choice |\n|kmmlu_hard_public_safety |lm_eval/tasks/kmmlu/hard/kmmlu_hard_public_safety.yaml |multiple_choice |\n|kmmlu_hard_railway_and_automotive_engineering |lm_eval/tasks/kmmlu/hard/kmmlu_hard_railway_and_automotive_engineering.yaml |multiple_choice |\n|kmmlu_hard_real_estate |lm_eval/tasks/kmmlu/hard/kmmlu_hard_real_estate.yaml |multiple_choice |\n|kmmlu_hard_refrigerating_machinery 
|lm_eval/tasks/kmmlu/hard/kmmlu_hard_refrigerating_machinery.yaml |multiple_choice |\n|kmmlu_hard_social_welfare |lm_eval/tasks/kmmlu/hard/kmmlu_hard_social_welfare.yaml |multiple_choice |\n|kmmlu_hard_taxation |lm_eval/tasks/kmmlu/hard/kmmlu_hard_taxation.yaml |multiple_choice |\n|kmmlu_hard_telecommunications_and_wireless_technology |lm_eval/tasks/kmmlu/hard/kmmlu_hard_telecommunications_and_wireless_technology.yaml |multiple_choice |\n|kobest_boolq |lm_eval/tasks/kobest/kobest_boolq.yaml |multiple_choice |\n|kobest_copa |lm_eval/tasks/kobest/kobest_copa.yaml |multiple_choice |\n|kobest_hellaswag |lm_eval/tasks/kobest/kobest_hellaswag.yaml |multiple_choice |\n|kobest_sentineg |lm_eval/tasks/kobest/kobest_sentineg.yaml |multiple_choice |\n|kobest_wic |lm_eval/tasks/kobest/kobest_wic.yaml |multiple_choice |\n|kormedmcqa_doctor |lm_eval/tasks/kormedmcqa/kormedmcqa_doctor.yaml |generate_until |\n|kormedmcqa_nurse |lm_eval/tasks/kormedmcqa/kormedmcqa_nurse.yaml |generate_until |\n|kormedmcqa_pharm |lm_eval/tasks/kormedmcqa/kormedmcqa_pharm.yaml |generate_until |\n|lambada_openai |lm_eval/tasks/lambada/lambada_openai.yaml |loglikelihood |\n|lambada_openai_cloze_yaml |lm_eval/tasks/lambada_cloze/lambada_openai_cloze.yaml |loglikelihood |\n|lambada_openai_mt_de |lm_eval/tasks/lambada_multilingual/lambada_mt_de.yaml |loglikelihood |\n|lambada_openai_mt_en |lm_eval/tasks/lambada_multilingual/lambada_mt_en.yaml |loglikelihood |\n|lambada_openai_mt_es |lm_eval/tasks/lambada_multilingual/lambada_mt_es.yaml |loglikelihood |\n|lambada_openai_mt_fr |lm_eval/tasks/lambada_multilingual/lambada_mt_fr.yaml |loglikelihood |\n|lambada_openai_mt_it |lm_eval/tasks/lambada_multilingual/lambada_mt_it.yaml |loglikelihood |\n|lambada_openai_mt_stablelm_de |lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_de.yaml |loglikelihood |\n|lambada_openai_mt_stablelm_en |lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_en.yaml |loglikelihood 
|\n|lambada_openai_mt_stablelm_es |lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_es.yaml |loglikelihood |\n|lambada_openai_mt_stablelm_fr |lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_fr.yaml |loglikelihood |\n|lambada_openai_mt_stablelm_it |lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_it.yaml |loglikelihood |\n|lambada_openai_mt_stablelm_nl |lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_nl.yaml |loglikelihood |\n|lambada_openai_mt_stablelm_pt |lm_eval/tasks/lambada_multilingual_stablelm/lambada_mt_stablelm_pt.yaml |loglikelihood |\n|lambada_standard |lm_eval/tasks/lambada/lambada_standard.yaml |loglikelihood |\n|lambada_standard_cloze_yaml |lm_eval/tasks/lambada_cloze/lambada_standard_cloze.yaml |loglikelihood |\n|leaderboard_bbh_boolean_expressions |lm_eval/tasks/leaderboard/bbh_mc/boolean_expressions.yaml |multiple_choice |\n|leaderboard_bbh_causal_judgement |lm_eval/tasks/leaderboard/bbh_mc/causal_judgement.yaml |multiple_choice |\n|leaderboard_bbh_date_understanding |lm_eval/tasks/leaderboard/bbh_mc/date_understanding.yaml |multiple_choice |\n|leaderboard_bbh_disambiguation_qa |lm_eval/tasks/leaderboard/bbh_mc/disambiguation_qa.yaml |multiple_choice |\n|leaderboard_bbh_formal_fallacies |lm_eval/tasks/leaderboard/bbh_mc/formal_fallacies.yaml |multiple_choice |\n|leaderboard_bbh_geometric_shapes |lm_eval/tasks/leaderboard/bbh_mc/geometric_shapes.yaml |multiple_choice |\n|leaderboard_bbh_hyperbaton |lm_eval/tasks/leaderboard/bbh_mc/hyperbaton.yaml |multiple_choice |\n|leaderboard_bbh_logical_deduction_five_objects |lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_five_objects.yaml |multiple_choice |\n|leaderboard_bbh_logical_deduction_seven_objects |lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_seven_objects.yaml |multiple_choice |\n|leaderboard_bbh_logical_deduction_three_objects |lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_three_objects.yaml |multiple_choice 
|\n|leaderboard_bbh_movie_recommendation |lm_eval/tasks/leaderboard/bbh_mc/movie_recommendation.yaml |multiple_choice |\n|leaderboard_bbh_navigate |lm_eval/tasks/leaderboard/bbh_mc/navigate.yaml |multiple_choice |\n|leaderboard_bbh_object_counting |lm_eval/tasks/leaderboard/bbh_mc/object_counting.yaml |multiple_choice |\n|leaderboard_bbh_penguins_in_a_table |lm_eval/tasks/leaderboard/bbh_mc/penguins_in_a_table.yaml |multiple_choice |\n|leaderboard_bbh_reasoning_about_colored_objects |lm_eval/tasks/leaderboard/bbh_mc/reasoning_about_colored_objects.yaml |multiple_choice |\n|leaderboard_bbh_ruin_names |lm_eval/tasks/leaderboard/bbh_mc/ruin_names.yaml |multiple_choice |\n|leaderboard_bbh_salient_translation_error_detection |lm_eval/tasks/leaderboard/bbh_mc/salient_translation_error_detection.yaml |multiple_choice |\n|leaderboard_bbh_snarks |lm_eval/tasks/leaderboard/bbh_mc/snarks.yaml |multiple_choice |\n|leaderboard_bbh_sports_understanding |lm_eval/tasks/leaderboard/bbh_mc/sports_understanding.yaml |multiple_choice |\n|leaderboard_bbh_temporal_sequences |lm_eval/tasks/leaderboard/bbh_mc/temporal_sequences.yaml |multiple_choice |\n|leaderboard_bbh_tracking_shuffled_objects_five_objects |lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_five_objects.yaml |multiple_choice |\n|leaderboard_bbh_tracking_shuffled_objects_seven_objects |lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_seven_objects.yaml |multiple_choice |\n|leaderboard_bbh_tracking_shuffled_objects_three_objects |lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_three_objects.yaml |multiple_choice |\n|leaderboard_bbh_web_of_lies |lm_eval/tasks/leaderboard/bbh_mc/web_of_lies.yaml |multiple_choice |\n|leaderboard_gpqa_diamond |lm_eval/tasks/leaderboard/gpqa/gpqa_diamond_zeroshot.yaml |multiple_choice |\n|leaderboard_gpqa_extended |lm_eval/tasks/leaderboard/gpqa/gpqa_extended_zeroshot.yaml |multiple_choice |\n|leaderboard_gpqa_main 
|lm_eval/tasks/leaderboard/gpqa/gpqa_main_zeroshot.yaml |multiple_choice |\n|leaderboard_ifeval |lm_eval/tasks/leaderboard/ifeval/ifeval.yaml |generate_until |\n|leaderboard_math_algebra_hard |lm_eval/tasks/leaderboard/math/math_algebra.yaml |generate_until |\n|leaderboard_math_counting_and_prob_hard |lm_eval/tasks/leaderboard/math/math_counting_and_prob.yaml |generate_until |\n|leaderboard_math_geometry_hard |lm_eval/tasks/leaderboard/math/math_geometry.yaml |generate_until |\n|leaderboard_math_intermediate_algebra_hard |lm_eval/tasks/leaderboard/math/math_intermediate_algebra.yaml |generate_until |\n|leaderboard_math_num_theory_hard |lm_eval/tasks/leaderboard/math/math_num_theory.yaml |generate_until |\n|leaderboard_math_prealgebra_hard |lm_eval/tasks/leaderboard/math/math_prealgebra.yaml |generate_until |\n|leaderboard_math_precalculus_hard |lm_eval/tasks/leaderboard/math/math_precalculus.yaml |generate_until |\n|leaderboard_mmlu_pro |lm_eval/tasks/leaderboard/mmlu_pro/mmlu_pro.yaml |multiple_choice |\n|leaderboard_musr_murder_mysteries |lm_eval/tasks/leaderboard/musr/musr_murder_mysteries.yaml |multiple_choice |\n|leaderboard_musr_object_placements |lm_eval/tasks/leaderboard/musr/musr_object_placements.yaml |multiple_choice |\n|leaderboard_musr_team_allocation |lm_eval/tasks/leaderboard/musr/musr_team_allocation.yaml |multiple_choice |\n|lingoly_context |lm_eval/tasks/lingoly/lingoly_context.yaml | |\n|lingoly_nocontext |lm_eval/tasks/lingoly/lingoly_nocontext.yaml | |\n|logieval |lm_eval/tasks/logiqa2/logieval.yaml |generate_until |\n|logiqa |lm_eval/tasks/logiqa/logiqa.yaml |multiple_choice |\n|logiqa2 |lm_eval/tasks/logiqa2/logiqa2.yaml |multiple_choice |\n|m_mmlu_ar |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_ar.yaml |multiple_choice |\n|m_mmlu_bn |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_bn.yaml |multiple_choice |\n|m_mmlu_ca |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_ca.yaml |multiple_choice |\n|m_mmlu_da 
|lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_da.yaml |multiple_choice |\n|m_mmlu_de |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_de.yaml |multiple_choice |\n|m_mmlu_en |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_en.yaml |multiple_choice |\n|m_mmlu_es |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_es.yaml |multiple_choice |\n|m_mmlu_eu |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_eu.yaml |multiple_choice |\n|m_mmlu_fr |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_fr.yaml |multiple_choice |\n|m_mmlu_gu |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_gu.yaml |multiple_choice |\n|m_mmlu_hi |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_hi.yaml |multiple_choice |\n|m_mmlu_hr |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_hr.yaml |multiple_choice |\n|m_mmlu_hu |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_hu.yaml |multiple_choice |\n|m_mmlu_hy |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_hy.yaml |multiple_choice |\n|m_mmlu_id |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_id.yaml |multiple_choice |\n|m_mmlu_is |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_is.yaml |multiple_choice |\n|m_mmlu_it |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_it.yaml |multiple_choice |\n|m_mmlu_kn |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_kn.yaml |multiple_choice |\n|m_mmlu_ml |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_ml.yaml |multiple_choice |\n|m_mmlu_mr |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_mr.yaml |multiple_choice |\n|m_mmlu_nb |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_nb.yaml |multiple_choice |\n|m_mmlu_ne |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_ne.yaml |multiple_choice |\n|m_mmlu_nl |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_nl.yaml |multiple_choice |\n|m_mmlu_pt |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_pt.yaml |multiple_choice |\n|m_mmlu_ro |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_ro.yaml |multiple_choice |\n|m_mmlu_ru |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_ru.yaml |multiple_choice |\n|m_mmlu_sk 
|lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_sk.yaml |multiple_choice |\n|m_mmlu_sr |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_sr.yaml |multiple_choice |\n|m_mmlu_sv |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_sv.yaml |multiple_choice |\n|m_mmlu_ta |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_ta.yaml |multiple_choice |\n|m_mmlu_te |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_te.yaml |multiple_choice |\n|m_mmlu_uk |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_uk.yaml |multiple_choice |\n|m_mmlu_vi |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_vi.yaml |multiple_choice |\n|m_mmlu_zh |lm_eval/tasks/okapi/mmlu_multilingual/m_mmlu_zh.yaml |multiple_choice |\n|mathqa |lm_eval/tasks/mathqa/mathqa.yaml |multiple_choice |\n|mc_taco |lm_eval/tasks/mc_taco/default.yaml |multiple_choice |\n|med_concepts_qa_atc_easy |lm_eval/tasks/med_concepts_qa/med_concepts_qa_atc_easy.yaml |multiple_choice |\n|med_concepts_qa_atc_hard |lm_eval/tasks/med_concepts_qa/med_concepts_qa_atc_hard.yaml |multiple_choice |\n|med_concepts_qa_atc_medium |lm_eval/tasks/med_concepts_qa/med_concepts_qa_atc_medium.yaml |multiple_choice |\n|med_concepts_qa_icd10cm_easy |lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10cm_easy.yaml |multiple_choice |\n|med_concepts_qa_icd10cm_hard |lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10cm_hard.yaml |multiple_choice |\n|med_concepts_qa_icd10cm_medium |lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10cm_medium.yaml |multiple_choice |\n|med_concepts_qa_icd10proc_easy |lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10proc_easy.yaml |multiple_choice |\n|med_concepts_qa_icd10proc_hard |lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10proc_hard.yaml |multiple_choice |\n|med_concepts_qa_icd10proc_medium |lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd10proc_medium.yaml |multiple_choice |\n|med_concepts_qa_icd9cm_easy |lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9cm_easy.yaml |multiple_choice |\n|med_concepts_qa_icd9cm_hard 
|lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9cm_hard.yaml |multiple_choice |\n|med_concepts_qa_icd9cm_medium |lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9cm_medium.yaml |multiple_choice |\n|med_concepts_qa_icd9proc_easy |lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9proc_easy.yaml |multiple_choice |\n|med_concepts_qa_icd9proc_hard |lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9proc_hard.yaml |multiple_choice |\n|med_concepts_qa_icd9proc_medium |lm_eval/tasks/med_concepts_qa/med_concepts_qa_icd9proc_medium.yaml |multiple_choice |\n|medmcqa |lm_eval/tasks/medmcqa/medmcqa.yaml |multiple_choice |\n|medqa_4options |lm_eval/tasks/medqa/medqa.yaml |multiple_choice |\n|mela_ar |lm_eval/tasks/mela/mela_ar.yaml |multiple_choice |\n|mela_de |lm_eval/tasks/mela/mela_de.yaml |multiple_choice |\n|mela_en |lm_eval/tasks/mela/mela_en.yaml |multiple_choice |\n|mela_es |lm_eval/tasks/mela/mela_es.yaml |multiple_choice |\n|mela_fr |lm_eval/tasks/mela/mela_fr.yaml |multiple_choice |\n|mela_is |lm_eval/tasks/mela/mela_is.yaml |multiple_choice |\n|mela_it |lm_eval/tasks/mela/mela_it.yaml |multiple_choice |\n|mela_ja |lm_eval/tasks/mela/mela_ja.yaml |multiple_choice |\n|mela_ru |lm_eval/tasks/mela/mela_ru.yaml |multiple_choice |\n|mela_zh |lm_eval/tasks/mela/mela_zh.yaml |multiple_choice |\n|mgsm_direct_bn |lm_eval/tasks/mgsm/direct/mgsm_direct_bn.yaml |generate_until |\n|mgsm_direct_de |lm_eval/tasks/mgsm/direct/mgsm_direct_de.yaml |generate_until |\n|mgsm_direct_en |lm_eval/tasks/mgsm/direct/mgsm_direct_en.yaml |generate_until |\n|mgsm_direct_es |lm_eval/tasks/mgsm/direct/mgsm_direct_es.yaml |generate_until |\n|mgsm_direct_fr |lm_eval/tasks/mgsm/direct/mgsm_direct_fr.yaml |generate_until |\n|mgsm_direct_ja |lm_eval/tasks/mgsm/direct/mgsm_direct_ja.yaml |generate_until |\n|mgsm_direct_ru |lm_eval/tasks/mgsm/direct/mgsm_direct_ru.yaml |generate_until |\n|mgsm_direct_sw |lm_eval/tasks/mgsm/direct/mgsm_direct_sw.yaml |generate_until |\n|mgsm_direct_te 
|lm_eval/tasks/mgsm/direct/mgsm_direct_te.yaml |generate_until |\n|mgsm_direct_th |lm_eval/tasks/mgsm/direct/mgsm_direct_th.yaml |generate_until |\n|mgsm_direct_zh |lm_eval/tasks/mgsm/direct/mgsm_direct_zh.yaml |generate_until |\n|mgsm_en_cot_bn |lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_bn.yaml |generate_until |\n|mgsm_en_cot_de |lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_de.yaml |generate_until |\n|mgsm_en_cot_en |lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_en.yaml |generate_until |\n|mgsm_en_cot_es |lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_es.yaml |generate_until |\n|mgsm_en_cot_fr |lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_fr.yaml |generate_until |\n|mgsm_en_cot_ja |lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_ja.yaml |generate_until |\n|mgsm_en_cot_ru |lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_ru.yaml |generate_until |\n|mgsm_en_cot_sw |lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_sw.yaml |generate_until |\n|mgsm_en_cot_te |lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_te.yaml |generate_until |\n|mgsm_en_cot_th |lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_th.yaml |generate_until |\n|mgsm_en_cot_zh |lm_eval/tasks/mgsm/en_cot/mgsm_en_cot_zh.yaml |generate_until |\n|mgsm_native_cot_bn |lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_bn.yaml |generate_until |\n|mgsm_native_cot_de |lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_de.yaml |generate_until |\n|mgsm_native_cot_en |lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_en.yaml |generate_until |\n|mgsm_native_cot_es |lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_es.yaml |generate_until |\n|mgsm_native_cot_fr |lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_fr.yaml |generate_until |\n|mgsm_native_cot_ja |lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_ja.yaml |generate_until |\n|mgsm_native_cot_ru |lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_ru.yaml |generate_until |\n|mgsm_native_cot_sw |lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_sw.yaml |generate_until |\n|mgsm_native_cot_te |lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_te.yaml |generate_until |\n|mgsm_native_cot_th 
|lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_th.yaml |generate_until |\n|mgsm_native_cot_zh |lm_eval/tasks/mgsm/native_cot/mgsm_native_cot_zh.yaml |generate_until |\n|minerva_math_algebra |lm_eval/tasks/minerva_math/minerva_math_algebra.yaml |generate_until |\n|minerva_math_counting_and_prob |lm_eval/tasks/minerva_math/minerva_math_counting_and_prob.yaml |generate_until |\n|minerva_math_geometry |lm_eval/tasks/minerva_math/minerva_math_geometry.yaml |generate_until |\n|minerva_math_intermediate_algebra |lm_eval/tasks/minerva_math/minerva_math_intermediate_algebra.yaml |generate_until |\n|minerva_math_num_theory |lm_eval/tasks/minerva_math/minerva_math_num_theory.yaml |generate_until |\n|minerva_math_prealgebra |lm_eval/tasks/minerva_math/minerva_math_prealgebra.yaml |generate_until |\n|minerva_math_precalc |lm_eval/tasks/minerva_math/minerva_math_precalc.yaml |generate_until |\n|mmlu_abstract_algebra |lm_eval/tasks/mmlu/default/mmlu_abstract_algebra.yaml |multiple_choice |\n|mmlu_abstract_algebra_generative |lm_eval/tasks/mmlu/generative/mmlu_abstract_algebra.yaml |generate_until |\n|mmlu_anatomy |lm_eval/tasks/mmlu/default/mmlu_anatomy.yaml |multiple_choice |\n|mmlu_anatomy_generative |lm_eval/tasks/mmlu/generative/mmlu_anatomy.yaml |generate_until |\n|mmlu_astronomy |lm_eval/tasks/mmlu/default/mmlu_astronomy.yaml |multiple_choice |\n|mmlu_astronomy_generative |lm_eval/tasks/mmlu/generative/mmlu_astronomy.yaml |generate_until |\n|mmlu_business_ethics |lm_eval/tasks/mmlu/default/mmlu_business_ethics.yaml |multiple_choice |\n|mmlu_business_ethics_generative |lm_eval/tasks/mmlu/generative/mmlu_business_ethics.yaml |generate_until |\n|mmlu_clinical_knowledge |lm_eval/tasks/mmlu/default/mmlu_clinical_knowledge.yaml |multiple_choice |\n|mmlu_clinical_knowledge_generative |lm_eval/tasks/mmlu/generative/mmlu_clinical_knowledge.yaml |generate_until |\n|mmlu_college_biology |lm_eval/tasks/mmlu/default/mmlu_college_biology.yaml |multiple_choice 
|\n|mmlu_college_biology_generative |lm_eval/tasks/mmlu/generative/mmlu_college_biology.yaml |generate_until |\n|mmlu_college_chemistry |lm_eval/tasks/mmlu/default/mmlu_college_chemistry.yaml |multiple_choice |\n|mmlu_college_chemistry_generative |lm_eval/tasks/mmlu/generative/mmlu_college_chemistry.yaml |generate_until |\n|mmlu_college_computer_science |lm_eval/tasks/mmlu/default/mmlu_college_computer_science.yaml |multiple_choice |\n|mmlu_college_computer_science_generative |lm_eval/tasks/mmlu/generative/mmlu_college_computer_science.yaml |generate_until |\n|mmlu_college_mathematics |lm_eval/tasks/mmlu/default/mmlu_college_mathematics.yaml |multiple_choice |\n|mmlu_college_mathematics_generative |lm_eval/tasks/mmlu/generative/mmlu_college_mathematics.yaml |generate_until |\n|mmlu_college_medicine |lm_eval/tasks/mmlu/default/mmlu_college_medicine.yaml |multiple_choice |\n|mmlu_college_medicine_generative |lm_eval/tasks/mmlu/generative/mmlu_college_medicine.yaml |generate_until |\n|mmlu_college_physics |lm_eval/tasks/mmlu/default/mmlu_college_physics.yaml |multiple_choice |\n|mmlu_college_physics_generative |lm_eval/tasks/mmlu/generative/mmlu_college_physics.yaml |generate_until |\n|mmlu_computer_security |lm_eval/tasks/mmlu/default/mmlu_computer_security.yaml |multiple_choice |\n|mmlu_computer_security_generative |lm_eval/tasks/mmlu/generative/mmlu_computer_security.yaml |generate_until |\n|mmlu_conceptual_physics |lm_eval/tasks/mmlu/default/mmlu_conceptual_physics.yaml |multiple_choice |\n|mmlu_conceptual_physics_generative |lm_eval/tasks/mmlu/generative/mmlu_conceptual_physics.yaml |generate_until |\n|mmlu_continuation_abstract_algebra |lm_eval/tasks/mmlu/continuation/mmlu_abstract_algebra.yaml |multiple_choice |\n|mmlu_continuation_anatomy |lm_eval/tasks/mmlu/continuation/mmlu_anatomy.yaml |multiple_choice |\n|mmlu_continuation_astronomy |lm_eval/tasks/mmlu/continuation/mmlu_astronomy.yaml |multiple_choice |\n|mmlu_continuation_business_ethics 
|lm_eval/tasks/mmlu/continuation/mmlu_business_ethics.yaml |multiple_choice |\n|mmlu_continuation_clinical_knowledge |lm_eval/tasks/mmlu/continuation/mmlu_clinical_knowledge.yaml |multiple_choice |\n|mmlu_continuation_college_biology |lm_eval/tasks/mmlu/continuation/mmlu_college_biology.yaml |multiple_choice |\n|mmlu_continuation_college_chemistry |lm_eval/tasks/mmlu/continuation/mmlu_college_chemistry.yaml |multiple_choice |\n|mmlu_continuation_college_computer_science |lm_eval/tasks/mmlu/continuation/mmlu_college_computer_science.yaml |multiple_choice |\n|mmlu_continuation_college_mathematics |lm_eval/tasks/mmlu/continuation/mmlu_college_mathematics.yaml |multiple_choice |\n|mmlu_continuation_college_medicine |lm_eval/tasks/mmlu/continuation/mmlu_college_medicine.yaml |multiple_choice |\n|mmlu_continuation_college_physics |lm_eval/tasks/mmlu/continuation/mmlu_college_physics.yaml |multiple_choice |\n|mmlu_continuation_computer_security |lm_eval/tasks/mmlu/continuation/mmlu_computer_security.yaml |multiple_choice |\n|mmlu_continuation_conceptual_physics |lm_eval/tasks/mmlu/continuation/mmlu_conceptual_physics.yaml |multiple_choice |\n|mmlu_continuation_econometrics |lm_eval/tasks/mmlu/continuation/mmlu_econometrics.yaml |multiple_choice |\n|mmlu_continuation_electrical_engineering |lm_eval/tasks/mmlu/continuation/mmlu_electrical_engineering.yaml |multiple_choice |\n|mmlu_continuation_elementary_mathematics |lm_eval/tasks/mmlu/continuation/mmlu_elementary_mathematics.yaml |multiple_choice |\n|mmlu_continuation_formal_logic |lm_eval/tasks/mmlu/continuation/mmlu_formal_logic.yaml |multiple_choice |\n|mmlu_continuation_global_facts |lm_eval/tasks/mmlu/continuation/mmlu_global_facts.yaml |multiple_choice |\n|mmlu_continuation_high_school_biology |lm_eval/tasks/mmlu/continuation/mmlu_high_school_biology.yaml |multiple_choice |\n|mmlu_continuation_high_school_chemistry |lm_eval/tasks/mmlu/continuation/mmlu_high_school_chemistry.yaml |multiple_choice 
|\n|mmlu_continuation_high_school_computer_science |lm_eval/tasks/mmlu/continuation/mmlu_high_school_computer_science.yaml |multiple_choice |\n|mmlu_continuation_high_school_european_history |lm_eval/tasks/mmlu/continuation/mmlu_high_school_european_history.yaml |multiple_choice |\n|mmlu_continuation_high_school_geography |lm_eval/tasks/mmlu/continuation/mmlu_high_school_geography.yaml |multiple_choice |\n|mmlu_continuation_high_school_government_and_politics |lm_eval/tasks/mmlu/continuation/mmlu_high_school_government_and_politics.yaml |multiple_choice |\n|mmlu_continuation_high_school_macroeconomics |lm_eval/tasks/mmlu/continuation/mmlu_high_school_macroeconomics.yaml |multiple_choice |\n|mmlu_continuation_high_school_mathematics |lm_eval/tasks/mmlu/continuation/mmlu_high_school_mathematics.yaml |multiple_choice |\n|mmlu_continuation_high_school_microeconomics |lm_eval/tasks/mmlu/continuation/mmlu_high_school_microeconomics.yaml |multiple_choice |\n|mmlu_continuation_high_school_physics |lm_eval/tasks/mmlu/continuation/mmlu_high_school_physics.yaml |multiple_choice |\n|mmlu_continuation_high_school_psychology |lm_eval/tasks/mmlu/continuation/mmlu_high_school_psychology.yaml |multiple_choice |\n|mmlu_continuation_high_school_statistics |lm_eval/tasks/mmlu/continuation/mmlu_high_school_statistics.yaml |multiple_choice |\n|mmlu_continuation_high_school_us_history |lm_eval/tasks/mmlu/continuation/mmlu_high_school_us_history.yaml |multiple_choice |\n|mmlu_continuation_high_school_world_history |lm_eval/tasks/mmlu/continuation/mmlu_high_school_world_history.yaml |multiple_choice |\n|mmlu_continuation_human_aging |lm_eval/tasks/mmlu/continuation/mmlu_human_aging.yaml |multiple_choice |\n|mmlu_continuation_human_sexuality |lm_eval/tasks/mmlu/continuation/mmlu_human_sexuality.yaml |multiple_choice |\n|mmlu_continuation_international_law |lm_eval/tasks/mmlu/continuation/mmlu_international_law.yaml |multiple_choice |\n|mmlu_continuation_jurisprudence 
|lm_eval/tasks/mmlu/continuation/mmlu_jurisprudence.yaml |multiple_choice |\n|mmlu_continuation_logical_fallacies |lm_eval/tasks/mmlu/continuation/mmlu_logical_fallacies.yaml |multiple_choice |\n|mmlu_continuation_machine_learning |lm_eval/tasks/mmlu/continuation/mmlu_machine_learning.yaml |multiple_choice |\n|mmlu_continuation_management |lm_eval/tasks/mmlu/continuation/mmlu_management.yaml |multiple_choice |\n|mmlu_continuation_marketing |lm_eval/tasks/mmlu/continuation/mmlu_marketing.yaml |multiple_choice |\n|mmlu_continuation_medical_genetics |lm_eval/tasks/mmlu/continuation/mmlu_medical_genetics.yaml |multiple_choice |\n|mmlu_continuation_miscellaneous |lm_eval/tasks/mmlu/continuation/mmlu_miscellaneous.yaml |multiple_choice |\n|mmlu_continuation_moral_disputes |lm_eval/tasks/mmlu/continuation/mmlu_moral_disputes.yaml |multiple_choice |\n|mmlu_continuation_moral_scenarios |lm_eval/tasks/mmlu/continuation/mmlu_moral_scenarios.yaml |multiple_choice |\n|mmlu_continuation_nutrition |lm_eval/tasks/mmlu/continuation/mmlu_nutrition.yaml |multiple_choice |\n|mmlu_continuation_philosophy |lm_eval/tasks/mmlu/continuation/mmlu_philosophy.yaml |multiple_choice |\n|mmlu_continuation_prehistory |lm_eval/tasks/mmlu/continuation/mmlu_prehistory.yaml |multiple_choice |\n|mmlu_continuation_professional_accounting |lm_eval/tasks/mmlu/continuation/mmlu_professional_accounting.yaml |multiple_choice |\n|mmlu_continuation_professional_law |lm_eval/tasks/mmlu/continuation/mmlu_professional_law.yaml |multiple_choice |\n|mmlu_continuation_professional_medicine |lm_eval/tasks/mmlu/continuation/mmlu_professional_medicine.yaml |multiple_choice |\n|mmlu_continuation_professional_psychology |lm_eval/tasks/mmlu/continuation/mmlu_professional_psychology.yaml |multiple_choice |\n|mmlu_continuation_public_relations |lm_eval/tasks/mmlu/continuation/mmlu_public_relations.yaml |multiple_choice |\n|mmlu_continuation_security_studies |lm_eval/tasks/mmlu/continuation/mmlu_security_studies.yaml 
|multiple_choice |\n|mmlu_continuation_sociology |lm_eval/tasks/mmlu/continuation/mmlu_sociology.yaml |multiple_choice |\n|mmlu_continuation_us_foreign_policy |lm_eval/tasks/mmlu/continuation/mmlu_us_foreign_policy.yaml |multiple_choice |\n|mmlu_continuation_virology |lm_eval/tasks/mmlu/continuation/mmlu_virology.yaml |multiple_choice |\n|mmlu_continuation_world_religions |lm_eval/tasks/mmlu/continuation/mmlu_world_religions.yaml |multiple_choice |\n|mmlu_econometrics |lm_eval/tasks/mmlu/default/mmlu_econometrics.yaml |multiple_choice |\n|mmlu_econometrics_generative |lm_eval/tasks/mmlu/generative/mmlu_econometrics.yaml |generate_until |\n|mmlu_electrical_engineering |lm_eval/tasks/mmlu/default/mmlu_electrical_engineering.yaml |multiple_choice |\n|mmlu_electrical_engineering_generative |lm_eval/tasks/mmlu/generative/mmlu_electrical_engineering.yaml |generate_until |\n|mmlu_elementary_mathematics |lm_eval/tasks/mmlu/default/mmlu_elementary_mathematics.yaml |multiple_choice |\n|mmlu_elementary_mathematics_generative |lm_eval/tasks/mmlu/generative/mmlu_elementary_mathematics.yaml |generate_until |\n|mmlu_flan_cot_fewshot_abstract_algebra |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_abstract_algebra.yaml |generate_until |\n|mmlu_flan_cot_fewshot_anatomy |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_anatomy.yaml |generate_until |\n|mmlu_flan_cot_fewshot_astronomy |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_astronomy.yaml |generate_until |\n|mmlu_flan_cot_fewshot_business_ethics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_business_ethics.yaml |generate_until |\n|mmlu_flan_cot_fewshot_clinical_knowledge |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_clinical_knowledge.yaml |generate_until |\n|mmlu_flan_cot_fewshot_college_biology |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_biology.yaml |generate_until |\n|mmlu_flan_cot_fewshot_college_chemistry |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_chemistry.yaml |generate_until |\n|mmlu_flan_cot_fewshot_college_computer_science 
|lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_computer_science.yaml |generate_until |\n|mmlu_flan_cot_fewshot_college_mathematics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_mathematics.yaml |generate_until |\n|mmlu_flan_cot_fewshot_college_medicine |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_medicine.yaml |generate_until |\n|mmlu_flan_cot_fewshot_college_physics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_physics.yaml |generate_until |\n|mmlu_flan_cot_fewshot_computer_security |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_computer_security.yaml |generate_until |\n|mmlu_flan_cot_fewshot_conceptual_physics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_conceptual_physics.yaml |generate_until |\n|mmlu_flan_cot_fewshot_econometrics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_econometrics.yaml |generate_until |\n|mmlu_flan_cot_fewshot_electrical_engineering |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_electrical_engineering.yaml |generate_until |\n|mmlu_flan_cot_fewshot_elementary_mathematics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_elementary_mathematics.yaml |generate_until |\n|mmlu_flan_cot_fewshot_formal_logic |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_formal_logic.yaml |generate_until |\n|mmlu_flan_cot_fewshot_global_facts |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_global_facts.yaml |generate_until |\n|mmlu_flan_cot_fewshot_high_school_biology |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_biology.yaml |generate_until |\n|mmlu_flan_cot_fewshot_high_school_chemistry |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_chemistry.yaml |generate_until |\n|mmlu_flan_cot_fewshot_high_school_computer_science |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_computer_science.yaml |generate_until |\n|mmlu_flan_cot_fewshot_high_school_european_history |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_european_history.yaml |generate_until |\n|mmlu_flan_cot_fewshot_high_school_geography 
|lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_geography.yaml |generate_until |\n|mmlu_flan_cot_fewshot_high_school_government_and_politics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_government_and_politics.yaml |generate_until |\n|mmlu_flan_cot_fewshot_high_school_macroeconomics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_macroeconomics.yaml |generate_until |\n|mmlu_flan_cot_fewshot_high_school_mathematics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_mathematics.yaml |generate_until |\n|mmlu_flan_cot_fewshot_high_school_microeconomics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_microeconomics.yaml |generate_until |\n|mmlu_flan_cot_fewshot_high_school_physics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_physics.yaml |generate_until |\n|mmlu_flan_cot_fewshot_high_school_psychology |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_psychology.yaml |generate_until |\n|mmlu_flan_cot_fewshot_high_school_statistics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_statistics.yaml |generate_until |\n|mmlu_flan_cot_fewshot_high_school_us_history |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_us_history.yaml |generate_until |\n|mmlu_flan_cot_fewshot_high_school_world_history |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_high_school_world_history.yaml |generate_until |\n|mmlu_flan_cot_fewshot_human_aging |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_aging.yaml |generate_until |\n|mmlu_flan_cot_fewshot_human_sexuality |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_human_sexuality.yaml |generate_until |\n|mmlu_flan_cot_fewshot_international_law |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_international_law.yaml |generate_until |\n|mmlu_flan_cot_fewshot_jurisprudence |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_jurisprudence.yaml |generate_until |\n|mmlu_flan_cot_fewshot_logical_fallacies |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_logical_fallacies.yaml |generate_until |\n|mmlu_flan_cot_fewshot_machine_learning 
|lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_machine_learning.yaml |generate_until |\n|mmlu_flan_cot_fewshot_management |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_management.yaml |generate_until |\n|mmlu_flan_cot_fewshot_marketing |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_marketing.yaml |generate_until |\n|mmlu_flan_cot_fewshot_medical_genetics |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_medical_genetics.yaml |generate_until |\n|mmlu_flan_cot_fewshot_miscellaneous |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_miscellaneous.yaml |generate_until |\n|mmlu_flan_cot_fewshot_moral_disputes |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_disputes.yaml |generate_until |\n|mmlu_flan_cot_fewshot_moral_scenarios |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_moral_scenarios.yaml |generate_until |\n|mmlu_flan_cot_fewshot_nutrition |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_nutrition.yaml |generate_until |\n|mmlu_flan_cot_fewshot_philosophy |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_philosophy.yaml |generate_until |\n|mmlu_flan_cot_fewshot_prehistory |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_prehistory.yaml |generate_until |\n|mmlu_flan_cot_fewshot_professional_accounting |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_accounting.yaml |generate_until |\n|mmlu_flan_cot_fewshot_professional_law |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_law.yaml |generate_until |\n|mmlu_flan_cot_fewshot_professional_medicine |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_medicine.yaml |generate_until |\n|mmlu_flan_cot_fewshot_professional_psychology |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_professional_psychology.yaml |generate_until |\n|mmlu_flan_cot_fewshot_public_relations |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_public_relations.yaml |generate_until |\n|mmlu_flan_cot_fewshot_security_studies |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_security_studies.yaml |generate_until |\n|mmlu_flan_cot_fewshot_sociology |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_sociology.yaml |generate_until 
|\n|mmlu_flan_cot_fewshot_us_foreign_policy |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_us_foreign_policy.yaml |generate_until |\n|mmlu_flan_cot_fewshot_virology |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_virology.yaml |generate_until |\n|mmlu_flan_cot_fewshot_world_religions |lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_world_religions.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_abstract_algebra |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_abstract_algebra.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_anatomy |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_anatomy.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_astronomy |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_astronomy.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_business_ethics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_business_ethics.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_clinical_knowledge |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_clinical_knowledge.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_college_biology |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_biology.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_college_chemistry |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_chemistry.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_college_computer_science |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_computer_science.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_college_mathematics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_mathematics.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_college_medicine |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_medicine.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_college_physics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_college_physics.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_computer_security |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_computer_security.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_conceptual_physics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_conceptual_physics.yaml |generate_until 
|\n|mmlu_flan_cot_zeroshot_econometrics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_econometrics.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_electrical_engineering |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_electrical_engineering.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_elementary_mathematics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_elementary_mathematics.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_formal_logic |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_formal_logic.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_global_facts |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_global_facts.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_high_school_biology |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_biology.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_high_school_chemistry |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_chemistry.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_high_school_computer_science |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_computer_science.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_high_school_european_history |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_european_history.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_high_school_geography |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_geography.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_high_school_government_and_politics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_government_and_politics.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_high_school_macroeconomics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_macroeconomics.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_high_school_mathematics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_mathematics.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_high_school_microeconomics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_microeconomics.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_high_school_physics 
|lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_physics.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_high_school_psychology |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_psychology.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_high_school_statistics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_statistics.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_high_school_us_history |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_us_history.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_high_school_world_history |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_high_school_world_history.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_human_aging |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_aging.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_human_sexuality |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_human_sexuality.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_international_law |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_international_law.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_jurisprudence |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_jurisprudence.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_logical_fallacies |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_logical_fallacies.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_machine_learning |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_machine_learning.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_management |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_management.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_marketing |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_marketing.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_medical_genetics |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_medical_genetics.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_miscellaneous |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_miscellaneous.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_moral_disputes |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_disputes.yaml |generate_until 
|\n|mmlu_flan_cot_zeroshot_moral_scenarios |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_moral_scenarios.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_nutrition |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_nutrition.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_philosophy |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_philosophy.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_prehistory |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_prehistory.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_professional_accounting |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_accounting.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_professional_law |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_law.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_professional_medicine |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_medicine.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_professional_psychology |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_professional_psychology.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_public_relations |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_public_relations.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_security_studies |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_security_studies.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_sociology |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_sociology.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_us_foreign_policy |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_us_foreign_policy.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_virology |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_virology.yaml |generate_until |\n|mmlu_flan_cot_zeroshot_world_religions |lm_eval/tasks/mmlu/flan_cot_zeroshot/mmlu_world_religions.yaml |generate_until |\n|mmlu_flan_n_shot_generative_abstract_algebra |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_abstract_algebra.yaml |generate_until |\n|mmlu_flan_n_shot_generative_anatomy |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_anatomy.yaml |generate_until 
|\n|mmlu_flan_n_shot_generative_astronomy |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_astronomy.yaml |generate_until |\n|mmlu_flan_n_shot_generative_business_ethics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_business_ethics.yaml |generate_until |\n|mmlu_flan_n_shot_generative_clinical_knowledge |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_clinical_knowledge.yaml |generate_until |\n|mmlu_flan_n_shot_generative_college_biology |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_biology.yaml |generate_until |\n|mmlu_flan_n_shot_generative_college_chemistry |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_chemistry.yaml |generate_until |\n|mmlu_flan_n_shot_generative_college_computer_science |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_computer_science.yaml |generate_until |\n|mmlu_flan_n_shot_generative_college_mathematics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_mathematics.yaml |generate_until |\n|mmlu_flan_n_shot_generative_college_medicine |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_medicine.yaml |generate_until |\n|mmlu_flan_n_shot_generative_college_physics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_college_physics.yaml |generate_until |\n|mmlu_flan_n_shot_generative_computer_security |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_computer_security.yaml |generate_until |\n|mmlu_flan_n_shot_generative_conceptual_physics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_conceptual_physics.yaml |generate_until |\n|mmlu_flan_n_shot_generative_econometrics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_econometrics.yaml |generate_until |\n|mmlu_flan_n_shot_generative_electrical_engineering |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_electrical_engineering.yaml |generate_until |\n|mmlu_flan_n_shot_generative_elementary_mathematics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_elementary_mathematics.yaml |generate_until |\n|mmlu_flan_n_shot_generative_formal_logic 
|lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_formal_logic.yaml |generate_until |\n|mmlu_flan_n_shot_generative_global_facts |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_global_facts.yaml |generate_until |\n|mmlu_flan_n_shot_generative_high_school_biology |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_biology.yaml |generate_until |\n|mmlu_flan_n_shot_generative_high_school_chemistry |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_chemistry.yaml |generate_until |\n|mmlu_flan_n_shot_generative_high_school_computer_science |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_computer_science.yaml |generate_until |\n|mmlu_flan_n_shot_generative_high_school_european_history |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_european_history.yaml |generate_until |\n|mmlu_flan_n_shot_generative_high_school_geography |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_geography.yaml |generate_until |\n|mmlu_flan_n_shot_generative_high_school_government_and_politics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_government_and_politics.yaml |generate_until |\n|mmlu_flan_n_shot_generative_high_school_macroeconomics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_macroeconomics.yaml |generate_until |\n|mmlu_flan_n_shot_generative_high_school_mathematics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_mathematics.yaml |generate_until |\n|mmlu_flan_n_shot_generative_high_school_microeconomics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_microeconomics.yaml |generate_until |\n|mmlu_flan_n_shot_generative_high_school_physics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_physics.yaml |generate_until |\n|mmlu_flan_n_shot_generative_high_school_psychology |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_psychology.yaml |generate_until |\n|mmlu_flan_n_shot_generative_high_school_statistics 
|lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_statistics.yaml |generate_until |\n|mmlu_flan_n_shot_generative_high_school_us_history |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_us_history.yaml |generate_until |\n|mmlu_flan_n_shot_generative_high_school_world_history |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_high_school_world_history.yaml |generate_until |\n|mmlu_flan_n_shot_generative_human_aging |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_human_aging.yaml |generate_until |\n|mmlu_flan_n_shot_generative_human_sexuality |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_human_sexuality.yaml |generate_until |\n|mmlu_flan_n_shot_generative_international_law |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_international_law.yaml |generate_until |\n|mmlu_flan_n_shot_generative_jurisprudence |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_jurisprudence.yaml |generate_until |\n|mmlu_flan_n_shot_generative_logical_fallacies |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_logical_fallacies.yaml |generate_until |\n|mmlu_flan_n_shot_generative_machine_learning |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_machine_learning.yaml |generate_until |\n|mmlu_flan_n_shot_generative_management |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_management.yaml |generate_until |\n|mmlu_flan_n_shot_generative_marketing |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_marketing.yaml |generate_until |\n|mmlu_flan_n_shot_generative_medical_genetics |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_medical_genetics.yaml |generate_until |\n|mmlu_flan_n_shot_generative_miscellaneous |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_miscellaneous.yaml |generate_until |\n|mmlu_flan_n_shot_generative_moral_disputes |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_moral_disputes.yaml |generate_until |\n|mmlu_flan_n_shot_generative_moral_scenarios |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_moral_scenarios.yaml |generate_until 
|\n|mmlu_flan_n_shot_generative_nutrition |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_nutrition.yaml |generate_until |\n|mmlu_flan_n_shot_generative_philosophy |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_philosophy.yaml |generate_until |\n|mmlu_flan_n_shot_generative_prehistory |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_prehistory.yaml |generate_until |\n|mmlu_flan_n_shot_generative_professional_accounting |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_accounting.yaml |generate_until |\n|mmlu_flan_n_shot_generative_professional_law |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_law.yaml |generate_until |\n|mmlu_flan_n_shot_generative_professional_medicine |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_medicine.yaml |generate_until |\n|mmlu_flan_n_shot_generative_professional_psychology |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_professional_psychology.yaml |generate_until |\n|mmlu_flan_n_shot_generative_public_relations |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_public_relations.yaml |generate_until |\n|mmlu_flan_n_shot_generative_security_studies |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_security_studies.yaml |generate_until |\n|mmlu_flan_n_shot_generative_sociology |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_sociology.yaml |generate_until |\n|mmlu_flan_n_shot_generative_us_foreign_policy |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_us_foreign_policy.yaml |generate_until |\n|mmlu_flan_n_shot_generative_virology |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_virology.yaml |generate_until |\n|mmlu_flan_n_shot_generative_world_religions |lm_eval/tasks/mmlu/flan_n_shot/generative/mmlu_world_religions.yaml |generate_until |\n|mmlu_flan_n_shot_loglikelihood_abstract_algebra |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_abstract_algebra.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_anatomy |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_anatomy.yaml |multiple_choice 
|\n|mmlu_flan_n_shot_loglikelihood_astronomy |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_astronomy.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_business_ethics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_business_ethics.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_clinical_knowledge |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_clinical_knowledge.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_college_biology |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_biology.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_college_chemistry |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_chemistry.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_college_computer_science |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_computer_science.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_college_mathematics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_mathematics.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_college_medicine |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_medicine.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_college_physics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_college_physics.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_computer_security |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_computer_security.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_conceptual_physics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_conceptual_physics.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_econometrics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_econometrics.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_electrical_engineering |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_electrical_engineering.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_elementary_mathematics 
|lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_elementary_mathematics.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_formal_logic |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_formal_logic.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_global_facts |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_global_facts.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_high_school_biology |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_biology.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_high_school_chemistry |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_chemistry.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_high_school_computer_science |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_computer_science.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_high_school_european_history |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_european_history.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_high_school_geography |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_geography.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_high_school_government_and_politics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_government_and_politics.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_high_school_macroeconomics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_macroeconomics.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_high_school_mathematics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_mathematics.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_high_school_microeconomics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_microeconomics.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_high_school_physics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_physics.yaml |multiple_choice 
|\n|mmlu_flan_n_shot_loglikelihood_high_school_psychology |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_psychology.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_high_school_statistics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_statistics.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_high_school_us_history |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_us_history.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_high_school_world_history |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_high_school_world_history.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_human_aging |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_aging.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_human_sexuality |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_human_sexuality.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_international_law |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_international_law.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_jurisprudence |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_jurisprudence.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_logical_fallacies |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_logical_fallacies.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_machine_learning |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_machine_learning.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_management |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_management.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_marketing |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_marketing.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_medical_genetics |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_medical_genetics.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_miscellaneous |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_miscellaneous.yaml |multiple_choice 
|\n|mmlu_flan_n_shot_loglikelihood_moral_disputes |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_disputes.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_moral_scenarios |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_moral_scenarios.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_nutrition |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_nutrition.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_philosophy |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_philosophy.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_prehistory |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_prehistory.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_professional_accounting |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_accounting.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_professional_law |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_law.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_professional_medicine |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_medicine.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_professional_psychology |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_professional_psychology.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_public_relations |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_public_relations.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_security_studies |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_security_studies.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_sociology |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_sociology.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_us_foreign_policy |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_us_foreign_policy.yaml |multiple_choice |\n|mmlu_flan_n_shot_loglikelihood_virology |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_virology.yaml |multiple_choice 
|\n|mmlu_flan_n_shot_loglikelihood_world_religions |lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/mmlu_world_religions.yaml |multiple_choice |\n|mmlu_formal_logic |lm_eval/tasks/mmlu/default/mmlu_formal_logic.yaml |multiple_choice |\n|mmlu_formal_logic_generative |lm_eval/tasks/mmlu/generative/mmlu_formal_logic.yaml |generate_until |\n|mmlu_global_facts |lm_eval/tasks/mmlu/default/mmlu_global_facts.yaml |multiple_choice |\n|mmlu_global_facts_generative |lm_eval/tasks/mmlu/generative/mmlu_global_facts.yaml |generate_until |\n|mmlu_high_school_biology |lm_eval/tasks/mmlu/default/mmlu_high_school_biology.yaml |multiple_choice |\n|mmlu_high_school_biology_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_biology.yaml |generate_until |\n|mmlu_high_school_chemistry |lm_eval/tasks/mmlu/default/mmlu_high_school_chemistry.yaml |multiple_choice |\n|mmlu_high_school_chemistry_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_chemistry.yaml |generate_until |\n|mmlu_high_school_computer_science |lm_eval/tasks/mmlu/default/mmlu_high_school_computer_science.yaml |multiple_choice |\n|mmlu_high_school_computer_science_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_computer_science.yaml |generate_until |\n|mmlu_high_school_european_history |lm_eval/tasks/mmlu/default/mmlu_high_school_european_history.yaml |multiple_choice |\n|mmlu_high_school_european_history_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_european_history.yaml |generate_until |\n|mmlu_high_school_geography |lm_eval/tasks/mmlu/default/mmlu_high_school_geography.yaml |multiple_choice |\n|mmlu_high_school_geography_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_geography.yaml |generate_until |\n|mmlu_high_school_government_and_politics |lm_eval/tasks/mmlu/default/mmlu_high_school_government_and_politics.yaml |multiple_choice |\n|mmlu_high_school_government_and_politics_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_government_and_politics.yaml 
|generate_until |\n|mmlu_high_school_macroeconomics |lm_eval/tasks/mmlu/default/mmlu_high_school_macroeconomics.yaml |multiple_choice |\n|mmlu_high_school_macroeconomics_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_macroeconomics.yaml |generate_until |\n|mmlu_high_school_mathematics |lm_eval/tasks/mmlu/default/mmlu_high_school_mathematics.yaml |multiple_choice |\n|mmlu_high_school_mathematics_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_mathematics.yaml |generate_until |\n|mmlu_high_school_microeconomics |lm_eval/tasks/mmlu/default/mmlu_high_school_microeconomics.yaml |multiple_choice |\n|mmlu_high_school_microeconomics_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_microeconomics.yaml |generate_until |\n|mmlu_high_school_physics |lm_eval/tasks/mmlu/default/mmlu_high_school_physics.yaml |multiple_choice |\n|mmlu_high_school_physics_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_physics.yaml |generate_until |\n|mmlu_high_school_psychology |lm_eval/tasks/mmlu/default/mmlu_high_school_psychology.yaml |multiple_choice |\n|mmlu_high_school_psychology_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_psychology.yaml |generate_until |\n|mmlu_high_school_statistics |lm_eval/tasks/mmlu/default/mmlu_high_school_statistics.yaml |multiple_choice |\n|mmlu_high_school_statistics_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_statistics.yaml |generate_until |\n|mmlu_high_school_us_history |lm_eval/tasks/mmlu/default/mmlu_high_school_us_history.yaml |multiple_choice |\n|mmlu_high_school_us_history_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_us_history.yaml |generate_until |\n|mmlu_high_school_world_history |lm_eval/tasks/mmlu/default/mmlu_high_school_world_history.yaml |multiple_choice |\n|mmlu_high_school_world_history_generative |lm_eval/tasks/mmlu/generative/mmlu_high_school_world_history.yaml |generate_until |\n|mmlu_human_aging |lm_eval/tasks/mmlu/default/mmlu_human_aging.yaml 
|multiple_choice |\n|mmlu_human_aging_generative |lm_eval/tasks/mmlu/generative/mmlu_human_aging.yaml |generate_until |\n|mmlu_human_sexuality |lm_eval/tasks/mmlu/default/mmlu_human_sexuality.yaml |multiple_choice |\n|mmlu_human_sexuality_generative |lm_eval/tasks/mmlu/generative/mmlu_human_sexuality.yaml |generate_until |\n|mmlu_international_law |lm_eval/tasks/mmlu/default/mmlu_international_law.yaml |multiple_choice |\n|mmlu_international_law_generative |lm_eval/tasks/mmlu/generative/mmlu_international_law.yaml |generate_until |\n|mmlu_jurisprudence |lm_eval/tasks/mmlu/default/mmlu_jurisprudence.yaml |multiple_choice |\n|mmlu_jurisprudence_generative |lm_eval/tasks/mmlu/generative/mmlu_jurisprudence.yaml |generate_until |\n|mmlu_logical_fallacies |lm_eval/tasks/mmlu/default/mmlu_logical_fallacies.yaml |multiple_choice |\n|mmlu_logical_fallacies_generative |lm_eval/tasks/mmlu/generative/mmlu_logical_fallacies.yaml |generate_until |\n|mmlu_machine_learning |lm_eval/tasks/mmlu/default/mmlu_machine_learning.yaml |multiple_choice |\n|mmlu_machine_learning_generative |lm_eval/tasks/mmlu/generative/mmlu_machine_learning.yaml |generate_until |\n|mmlu_management |lm_eval/tasks/mmlu/default/mmlu_management.yaml |multiple_choice |\n|mmlu_management_generative |lm_eval/tasks/mmlu/generative/mmlu_management.yaml |generate_until |\n|mmlu_marketing |lm_eval/tasks/mmlu/default/mmlu_marketing.yaml |multiple_choice |\n|mmlu_marketing_generative |lm_eval/tasks/mmlu/generative/mmlu_marketing.yaml |generate_until |\n|mmlu_medical_genetics |lm_eval/tasks/mmlu/default/mmlu_medical_genetics.yaml |multiple_choice |\n|mmlu_medical_genetics_generative |lm_eval/tasks/mmlu/generative/mmlu_medical_genetics.yaml |generate_until |\n|mmlu_miscellaneous |lm_eval/tasks/mmlu/default/mmlu_miscellaneous.yaml |multiple_choice |\n|mmlu_miscellaneous_generative |lm_eval/tasks/mmlu/generative/mmlu_miscellaneous.yaml |generate_until |\n|mmlu_moral_disputes 
|lm_eval/tasks/mmlu/default/mmlu_moral_disputes.yaml |multiple_choice |\n|mmlu_moral_disputes_generative |lm_eval/tasks/mmlu/generative/mmlu_moral_disputes.yaml |generate_until |\n|mmlu_moral_scenarios |lm_eval/tasks/mmlu/default/mmlu_moral_scenarios.yaml |multiple_choice |\n|mmlu_moral_scenarios_generative |lm_eval/tasks/mmlu/generative/mmlu_moral_scenarios.yaml |generate_until |\n|mmlu_nutrition |lm_eval/tasks/mmlu/default/mmlu_nutrition.yaml |multiple_choice |\n|mmlu_nutrition_generative |lm_eval/tasks/mmlu/generative/mmlu_nutrition.yaml |generate_until |\n|mmlu_philosophy |lm_eval/tasks/mmlu/default/mmlu_philosophy.yaml |multiple_choice |\n|mmlu_philosophy_generative |lm_eval/tasks/mmlu/generative/mmlu_philosophy.yaml |generate_until |\n|mmlu_prehistory |lm_eval/tasks/mmlu/default/mmlu_prehistory.yaml |multiple_choice |\n|mmlu_prehistory_generative |lm_eval/tasks/mmlu/generative/mmlu_prehistory.yaml |generate_until |\n|mmlu_pro_biology |lm_eval/tasks/mmlu_pro/mmlu_pro_biology.yaml |generate_until |\n|mmlu_pro_business |lm_eval/tasks/mmlu_pro/mmlu_pro_business.yaml |generate_until |\n|mmlu_pro_chemistry |lm_eval/tasks/mmlu_pro/mmlu_pro_chemistry.yaml |generate_until |\n|mmlu_pro_computer_science |lm_eval/tasks/mmlu_pro/mmlu_pro_computer_science.yaml |generate_until |\n|mmlu_pro_economics |lm_eval/tasks/mmlu_pro/mmlu_pro_economics.yaml |generate_until |\n|mmlu_pro_engineering |lm_eval/tasks/mmlu_pro/mmlu_pro_engineering.yaml |generate_until |\n|mmlu_pro_health |lm_eval/tasks/mmlu_pro/mmlu_pro_health.yaml |generate_until |\n|mmlu_pro_history |lm_eval/tasks/mmlu_pro/mmlu_pro_history.yaml |generate_until |\n|mmlu_pro_law |lm_eval/tasks/mmlu_pro/mmlu_pro_law.yaml |generate_until |\n|mmlu_pro_math |lm_eval/tasks/mmlu_pro/mmlu_pro_math.yaml |generate_until |\n|mmlu_pro_other |lm_eval/tasks/mmlu_pro/mmlu_pro_other.yaml |generate_until |\n|mmlu_pro_philosophy |lm_eval/tasks/mmlu_pro/mmlu_pro_philosophy.yaml |generate_until |\n|mmlu_pro_physics 
|lm_eval/tasks/mmlu_pro/mmlu_pro_physics.yaml |generate_until |\n|mmlu_pro_psychology |lm_eval/tasks/mmlu_pro/mmlu_pro_psychology.yaml |generate_until |\n|mmlu_professional_accounting |lm_eval/tasks/mmlu/default/mmlu_professional_accounting.yaml |multiple_choice |\n|mmlu_professional_accounting_generative |lm_eval/tasks/mmlu/generative/mmlu_professional_accounting.yaml |generate_until |\n|mmlu_professional_law |lm_eval/tasks/mmlu/default/mmlu_professional_law.yaml |multiple_choice |\n|mmlu_professional_law_generative |lm_eval/tasks/mmlu/generative/mmlu_professional_law.yaml |generate_until |\n|mmlu_professional_medicine |lm_eval/tasks/mmlu/default/mmlu_professional_medicine.yaml |multiple_choice |\n|mmlu_professional_medicine_generative |lm_eval/tasks/mmlu/generative/mmlu_professional_medicine.yaml |generate_until |\n|mmlu_professional_psychology |lm_eval/tasks/mmlu/default/mmlu_professional_psychology.yaml |multiple_choice |\n|mmlu_professional_psychology_generative |lm_eval/tasks/mmlu/generative/mmlu_professional_psychology.yaml |generate_until |\n|mmlu_public_relations |lm_eval/tasks/mmlu/default/mmlu_public_relations.yaml |multiple_choice |\n|mmlu_public_relations_generative |lm_eval/tasks/mmlu/generative/mmlu_public_relations.yaml |generate_until |\n|mmlu_security_studies |lm_eval/tasks/mmlu/default/mmlu_security_studies.yaml |multiple_choice |\n|mmlu_security_studies_generative |lm_eval/tasks/mmlu/generative/mmlu_security_studies.yaml |generate_until |\n|mmlu_sociology |lm_eval/tasks/mmlu/default/mmlu_sociology.yaml |multiple_choice |\n|mmlu_sociology_generative |lm_eval/tasks/mmlu/generative/mmlu_sociology.yaml |generate_until |\n|mmlu_us_foreign_policy |lm_eval/tasks/mmlu/default/mmlu_us_foreign_policy.yaml |multiple_choice |\n|mmlu_us_foreign_policy_generative |lm_eval/tasks/mmlu/generative/mmlu_us_foreign_policy.yaml |generate_until |\n|mmlu_virology |lm_eval/tasks/mmlu/default/mmlu_virology.yaml |multiple_choice |\n|mmlu_virology_generative 
|lm_eval/tasks/mmlu/generative/mmlu_virology.yaml |generate_until |\n|mmlu_world_religions |lm_eval/tasks/mmlu/default/mmlu_world_religions.yaml |multiple_choice |\n|mmlu_world_religions_generative |lm_eval/tasks/mmlu/generative/mmlu_world_religions.yaml |generate_until |\n|mmlusr_answer_only_abstract_algebra |lm_eval/tasks/mmlusr/answer_only/answer_only_abstract_algebra.yaml |multiple_choice |\n|mmlusr_answer_only_anatomy |lm_eval/tasks/mmlusr/answer_only/answer_only_anatomy.yaml |multiple_choice |\n|mmlusr_answer_only_astronomy |lm_eval/tasks/mmlusr/answer_only/answer_only_astronomy.yaml |multiple_choice |\n|mmlusr_answer_only_business_ethics |lm_eval/tasks/mmlusr/answer_only/answer_only_business_ethics.yaml |multiple_choice |\n|mmlusr_answer_only_clinical_knowledge |lm_eval/tasks/mmlusr/answer_only/answer_only_clinical_knowledge.yaml |multiple_choice |\n|mmlusr_answer_only_college_biology |lm_eval/tasks/mmlusr/answer_only/answer_only_college_biology.yaml |multiple_choice |\n|mmlusr_answer_only_college_chemistry |lm_eval/tasks/mmlusr/answer_only/answer_only_college_chemistry.yaml |multiple_choice |\n|mmlusr_answer_only_college_computer_science |lm_eval/tasks/mmlusr/answer_only/answer_only_college_computer_science.yaml |multiple_choice |\n|mmlusr_answer_only_college_mathematics |lm_eval/tasks/mmlusr/answer_only/answer_only_college_mathematics.yaml |multiple_choice |\n|mmlusr_answer_only_college_medicine |lm_eval/tasks/mmlusr/answer_only/answer_only_college_medicine.yaml |multiple_choice |\n|mmlusr_answer_only_college_physics |lm_eval/tasks/mmlusr/answer_only/answer_only_college_physics.yaml |multiple_choice |\n|mmlusr_answer_only_computer_security |lm_eval/tasks/mmlusr/answer_only/answer_only_computer_security.yaml |multiple_choice |\n|mmlusr_answer_only_conceptual_physics |lm_eval/tasks/mmlusr/answer_only/answer_only_conceptual_physics.yaml |multiple_choice |\n|mmlusr_answer_only_econometrics |lm_eval/tasks/mmlusr/answer_only/answer_only_econometrics.yaml 
|multiple_choice |\n|mmlusr_answer_only_electrical_engineering |lm_eval/tasks/mmlusr/answer_only/answer_only_electrical_engineering.yaml |multiple_choice |\n|mmlusr_answer_only_elementary_mathematics |lm_eval/tasks/mmlusr/answer_only/answer_only_elementary_mathematics.yaml |multiple_choice |\n|mmlusr_answer_only_formal_logic |lm_eval/tasks/mmlusr/answer_only/answer_only_formal_logic.yaml |multiple_choice |\n|mmlusr_answer_only_global_facts |lm_eval/tasks/mmlusr/answer_only/answer_only_global_facts.yaml |multiple_choice |\n|mmlusr_answer_only_high_school_biology |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_biology.yaml |multiple_choice |\n|mmlusr_answer_only_high_school_chemistry |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_chemistry.yaml |multiple_choice |\n|mmlusr_answer_only_high_school_computer_science |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_computer_science.yaml |multiple_choice |\n|mmlusr_answer_only_high_school_european_history |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_european_history.yaml |multiple_choice |\n|mmlusr_answer_only_high_school_geography |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_geography.yaml |multiple_choice |\n|mmlusr_answer_only_high_school_government_and_politics |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_government_and_politics.yaml |multiple_choice |\n|mmlusr_answer_only_high_school_macroeconomics |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_macroeconomics.yaml |multiple_choice |\n|mmlusr_answer_only_high_school_mathematics |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_mathematics.yaml |multiple_choice |\n|mmlusr_answer_only_high_school_microeconomics |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_microeconomics.yaml |multiple_choice |\n|mmlusr_answer_only_high_school_physics |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_physics.yaml |multiple_choice |\n|mmlusr_answer_only_high_school_psychology 
|lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_psychology.yaml |multiple_choice |\n|mmlusr_answer_only_high_school_statistics |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_statistics.yaml |multiple_choice |\n|mmlusr_answer_only_high_school_us_history |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_us_history.yaml |multiple_choice |\n|mmlusr_answer_only_high_school_world_history |lm_eval/tasks/mmlusr/answer_only/answer_only_high_school_world_history.yaml |multiple_choice |\n|mmlusr_answer_only_human_aging |lm_eval/tasks/mmlusr/answer_only/answer_only_human_aging.yaml |multiple_choice |\n|mmlusr_answer_only_human_sexuality |lm_eval/tasks/mmlusr/answer_only/answer_only_human_sexuality.yaml |multiple_choice |\n|mmlusr_answer_only_international_law |lm_eval/tasks/mmlusr/answer_only/answer_only_international_law.yaml |multiple_choice |\n|mmlusr_answer_only_jurisprudence |lm_eval/tasks/mmlusr/answer_only/answer_only_jurisprudence.yaml |multiple_choice |\n|mmlusr_answer_only_logical_fallacies |lm_eval/tasks/mmlusr/answer_only/answer_only_logical_fallacies.yaml |multiple_choice |\n|mmlusr_answer_only_machine_learning |lm_eval/tasks/mmlusr/answer_only/answer_only_machine_learning.yaml |multiple_choice |\n|mmlusr_answer_only_management |lm_eval/tasks/mmlusr/answer_only/answer_only_management.yaml |multiple_choice |\n|mmlusr_answer_only_marketing |lm_eval/tasks/mmlusr/answer_only/answer_only_marketing.yaml |multiple_choice |\n|mmlusr_answer_only_medical_genetics |lm_eval/tasks/mmlusr/answer_only/answer_only_medical_genetics.yaml |multiple_choice |\n|mmlusr_answer_only_miscellaneous |lm_eval/tasks/mmlusr/answer_only/answer_only_miscellaneous.yaml |multiple_choice |\n|mmlusr_answer_only_moral_disputes |lm_eval/tasks/mmlusr/answer_only/answer_only_moral_disputes.yaml |multiple_choice |\n|mmlusr_answer_only_moral_scenarios |lm_eval/tasks/mmlusr/answer_only/answer_only_moral_scenarios.yaml |multiple_choice |\n|mmlusr_answer_only_nutrition 
|lm_eval/tasks/mmlusr/answer_only/answer_only_nutrition.yaml |multiple_choice |\n|mmlusr_answer_only_philosophy |lm_eval/tasks/mmlusr/answer_only/answer_only_philosophy.yaml |multiple_choice |\n|mmlusr_answer_only_prehistory |lm_eval/tasks/mmlusr/answer_only/answer_only_prehistory.yaml |multiple_choice |\n|mmlusr_answer_only_professional_accounting |lm_eval/tasks/mmlusr/answer_only/answer_only_professional_accounting.yaml |multiple_choice |\n|mmlusr_answer_only_professional_law |lm_eval/tasks/mmlusr/answer_only/answer_only_professional_law.yaml |multiple_choice |\n|mmlusr_answer_only_professional_medicine |lm_eval/tasks/mmlusr/answer_only/answer_only_professional_medicine.yaml |multiple_choice |\n|mmlusr_answer_only_professional_psychology |lm_eval/tasks/mmlusr/answer_only/answer_only_professional_psychology.yaml |multiple_choice |\n|mmlusr_answer_only_public_relations |lm_eval/tasks/mmlusr/answer_only/answer_only_public_relations.yaml |multiple_choice |\n|mmlusr_answer_only_security_studies |lm_eval/tasks/mmlusr/answer_only/answer_only_security_studies.yaml |multiple_choice |\n|mmlusr_answer_only_sociology |lm_eval/tasks/mmlusr/answer_only/answer_only_sociology.yaml |multiple_choice |\n|mmlusr_answer_only_us_foreign_policy |lm_eval/tasks/mmlusr/answer_only/answer_only_us_foreign_policy.yaml |multiple_choice |\n|mmlusr_answer_only_virology |lm_eval/tasks/mmlusr/answer_only/answer_only_virology.yaml |multiple_choice |\n|mmlusr_answer_only_world_religions |lm_eval/tasks/mmlusr/answer_only/answer_only_world_religions.yaml |multiple_choice |\n|mmlusr_question_and_answer_abstract_algebra |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_abstract_algebra.yaml |multiple_choice |\n|mmlusr_question_and_answer_anatomy |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_anatomy.yaml |multiple_choice |\n|mmlusr_question_and_answer_astronomy |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_astronomy.yaml |multiple_choice 
|\n|mmlusr_question_and_answer_business_ethics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_business_ethics.yaml |multiple_choice |\n|mmlusr_question_and_answer_clinical_knowledge |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_clinical_knowledge.yaml |multiple_choice |\n|mmlusr_question_and_answer_college_biology |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_college_biology.yaml |multiple_choice |\n|mmlusr_question_and_answer_college_chemistry |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_college_chemistry.yaml |multiple_choice |\n|mmlusr_question_and_answer_college_computer_science |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_college_computer_science.yaml |multiple_choice |\n|mmlusr_question_and_answer_college_mathematics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_college_mathematics.yaml |multiple_choice |\n|mmlusr_question_and_answer_college_medicine |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_college_medicine.yaml |multiple_choice |\n|mmlusr_question_and_answer_college_physics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_college_physics.yaml |multiple_choice |\n|mmlusr_question_and_answer_computer_security |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_computer_security.yaml |multiple_choice |\n|mmlusr_question_and_answer_conceptual_physics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_conceptual_physics.yaml |multiple_choice |\n|mmlusr_question_and_answer_econometrics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_econometrics.yaml |multiple_choice |\n|mmlusr_question_and_answer_electrical_engineering |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_electrical_engineering.yaml |multiple_choice |\n|mmlusr_question_and_answer_elementary_mathematics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_elementary_mathematics.yaml |multiple_choice 
|\n|mmlusr_question_and_answer_formal_logic |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_formal_logic.yaml |multiple_choice |\n|mmlusr_question_and_answer_global_facts |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_global_facts.yaml |multiple_choice |\n|mmlusr_question_and_answer_high_school_biology |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_biology.yaml |multiple_choice |\n|mmlusr_question_and_answer_high_school_chemistry |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_chemistry.yaml |multiple_choice |\n|mmlusr_question_and_answer_high_school_computer_science |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_computer_science.yaml |multiple_choice |\n|mmlusr_question_and_answer_high_school_european_history |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_european_history.yaml |multiple_choice |\n|mmlusr_question_and_answer_high_school_geography |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_geography.yaml |multiple_choice |\n|mmlusr_question_and_answer_high_school_government_and_politics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_government_and_politics.yaml |multiple_choice |\n|mmlusr_question_and_answer_high_school_macroeconomics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_macroeconomics.yaml |multiple_choice |\n|mmlusr_question_and_answer_high_school_mathematics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_mathematics.yaml |multiple_choice |\n|mmlusr_question_and_answer_high_school_microeconomics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_microeconomics.yaml |multiple_choice |\n|mmlusr_question_and_answer_high_school_physics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_physics.yaml |multiple_choice |\n|mmlusr_question_and_answer_high_school_psychology 
|lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_psychology.yaml |multiple_choice |\n|mmlusr_question_and_answer_high_school_statistics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_statistics.yaml |multiple_choice |\n|mmlusr_question_and_answer_high_school_us_history |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_us_history.yaml |multiple_choice |\n|mmlusr_question_and_answer_high_school_world_history |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_high_school_world_history.yaml |multiple_choice |\n|mmlusr_question_and_answer_human_aging |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_human_aging.yaml |multiple_choice |\n|mmlusr_question_and_answer_human_sexuality |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_human_sexuality.yaml |multiple_choice |\n|mmlusr_question_and_answer_international_law |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_international_law.yaml |multiple_choice |\n|mmlusr_question_and_answer_jurisprudence |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_jurisprudence.yaml |multiple_choice |\n|mmlusr_question_and_answer_logical_fallacies |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_logical_fallacies.yaml |multiple_choice |\n|mmlusr_question_and_answer_machine_learning |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_machine_learning.yaml |multiple_choice |\n|mmlusr_question_and_answer_management |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_management.yaml |multiple_choice |\n|mmlusr_question_and_answer_marketing |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_marketing.yaml |multiple_choice |\n|mmlusr_question_and_answer_medical_genetics |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_medical_genetics.yaml |multiple_choice |\n|mmlusr_question_and_answer_miscellaneous 
|lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_miscellaneous.yaml |multiple_choice |\n|mmlusr_question_and_answer_moral_disputes |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_moral_disputes.yaml |multiple_choice |\n|mmlusr_question_and_answer_moral_scenarios |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_moral_scenarios.yaml |multiple_choice |\n|mmlusr_question_and_answer_nutrition |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_nutrition.yaml |multiple_choice |\n|mmlusr_question_and_answer_philosophy |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_philosophy.yaml |multiple_choice |\n|mmlusr_question_and_answer_prehistory |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_prehistory.yaml |multiple_choice |\n|mmlusr_question_and_answer_professional_accounting |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_professional_accounting.yaml |multiple_choice |\n|mmlusr_question_and_answer_professional_law |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_professional_law.yaml |multiple_choice |\n|mmlusr_question_and_answer_professional_medicine |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_professional_medicine.yaml |multiple_choice |\n|mmlusr_question_and_answer_professional_psychology |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_professional_psychology.yaml |multiple_choice |\n|mmlusr_question_and_answer_public_relations |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_public_relations.yaml |multiple_choice |\n|mmlusr_question_and_answer_security_studies |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_security_studies.yaml |multiple_choice |\n|mmlusr_question_and_answer_sociology |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_sociology.yaml |multiple_choice |\n|mmlusr_question_and_answer_us_foreign_policy |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_us_foreign_policy.yaml 
|multiple_choice |\n|mmlusr_question_and_answer_virology |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_virology.yaml |multiple_choice |\n|mmlusr_question_and_answer_world_religions |lm_eval/tasks/mmlusr/question_and_answer/question_and_answer_world_religions.yaml |multiple_choice |\n|mmlusr_question_only_abstract_algebra |lm_eval/tasks/mmlusr/question_only/question_only_abstract_algebra.yaml |multiple_choice |\n|mmlusr_question_only_anatomy |lm_eval/tasks/mmlusr/question_only/question_only_anatomy.yaml |multiple_choice |\n|mmlusr_question_only_astronomy |lm_eval/tasks/mmlusr/question_only/question_only_astronomy.yaml |multiple_choice |\n|mmlusr_question_only_business_ethics |lm_eval/tasks/mmlusr/question_only/question_only_business_ethics.yaml |multiple_choice |\n|mmlusr_question_only_clinical_knowledge |lm_eval/tasks/mmlusr/question_only/question_only_clinical_knowledge.yaml |multiple_choice |\n|mmlusr_question_only_college_biology |lm_eval/tasks/mmlusr/question_only/question_only_college_biology.yaml |multiple_choice |\n|mmlusr_question_only_college_chemistry |lm_eval/tasks/mmlusr/question_only/question_only_college_chemistry.yaml |multiple_choice |\n|mmlusr_question_only_college_computer_science |lm_eval/tasks/mmlusr/question_only/question_only_college_computer_science.yaml |multiple_choice |\n|mmlusr_question_only_college_mathematics |lm_eval/tasks/mmlusr/question_only/question_only_college_mathematics.yaml |multiple_choice |\n|mmlusr_question_only_college_medicine |lm_eval/tasks/mmlusr/question_only/question_only_college_medicine.yaml |multiple_choice |\n|mmlusr_question_only_college_physics |lm_eval/tasks/mmlusr/question_only/question_only_college_physics.yaml |multiple_choice |\n|mmlusr_question_only_computer_security |lm_eval/tasks/mmlusr/question_only/question_only_computer_security.yaml |multiple_choice |\n|mmlusr_question_only_conceptual_physics |lm_eval/tasks/mmlusr/question_only/question_only_conceptual_physics.yaml |multiple_choice 
|\n|mmlusr_question_only_econometrics |lm_eval/tasks/mmlusr/question_only/question_only_econometrics.yaml |multiple_choice |\n|mmlusr_question_only_electrical_engineering |lm_eval/tasks/mmlusr/question_only/question_only_electrical_engineering.yaml |multiple_choice |\n|mmlusr_question_only_elementary_mathematics |lm_eval/tasks/mmlusr/question_only/question_only_elementary_mathematics.yaml |multiple_choice |\n|mmlusr_question_only_formal_logic |lm_eval/tasks/mmlusr/question_only/question_only_formal_logic.yaml |multiple_choice |\n|mmlusr_question_only_global_facts |lm_eval/tasks/mmlusr/question_only/question_only_global_facts.yaml |multiple_choice |\n|mmlusr_question_only_high_school_biology |lm_eval/tasks/mmlusr/question_only/question_only_high_school_biology.yaml |multiple_choice |\n|mmlusr_question_only_high_school_chemistry |lm_eval/tasks/mmlusr/question_only/question_only_high_school_chemistry.yaml |multiple_choice |\n|mmlusr_question_only_high_school_computer_science |lm_eval/tasks/mmlusr/question_only/question_only_high_school_computer_science.yaml |multiple_choice |\n|mmlusr_question_only_high_school_european_history |lm_eval/tasks/mmlusr/question_only/question_only_high_school_european_history.yaml |multiple_choice |\n|mmlusr_question_only_high_school_geography |lm_eval/tasks/mmlusr/question_only/question_only_high_school_geography.yaml |multiple_choice |\n|mmlusr_question_only_high_school_government_and_politics |lm_eval/tasks/mmlusr/question_only/question_only_high_school_government_and_politics.yaml |multiple_choice |\n|mmlusr_question_only_high_school_macroeconomics |lm_eval/tasks/mmlusr/question_only/question_only_high_school_macroeconomics.yaml |multiple_choice |\n|mmlusr_question_only_high_school_mathematics |lm_eval/tasks/mmlusr/question_only/question_only_high_school_mathematics.yaml |multiple_choice |\n|mmlusr_question_only_high_school_microeconomics |lm_eval/tasks/mmlusr/question_only/question_only_high_school_microeconomics.yaml |multiple_choice 
|\n|mmlusr_question_only_high_school_physics |lm_eval/tasks/mmlusr/question_only/question_only_high_school_physics.yaml |multiple_choice |\n|mmlusr_question_only_high_school_psychology |lm_eval/tasks/mmlusr/question_only/question_only_high_school_psychology.yaml |multiple_choice |\n|mmlusr_question_only_high_school_statistics |lm_eval/tasks/mmlusr/question_only/question_only_high_school_statistics.yaml |multiple_choice |\n|mmlusr_question_only_high_school_us_history |lm_eval/tasks/mmlusr/question_only/question_only_high_school_us_history.yaml |multiple_choice |\n|mmlusr_question_only_high_school_world_history |lm_eval/tasks/mmlusr/question_only/question_only_high_school_world_history.yaml |multiple_choice |\n|mmlusr_question_only_human_aging |lm_eval/tasks/mmlusr/question_only/question_only_human_aging.yaml |multiple_choice |\n|mmlusr_question_only_human_sexuality |lm_eval/tasks/mmlusr/question_only/question_only_human_sexuality.yaml |multiple_choice |\n|mmlusr_question_only_international_law |lm_eval/tasks/mmlusr/question_only/question_only_international_law.yaml |multiple_choice |\n|mmlusr_question_only_jurisprudence |lm_eval/tasks/mmlusr/question_only/question_only_jurisprudence.yaml |multiple_choice |\n|mmlusr_question_only_logical_fallacies |lm_eval/tasks/mmlusr/question_only/question_only_logical_fallacies.yaml |multiple_choice |\n|mmlusr_question_only_machine_learning |lm_eval/tasks/mmlusr/question_only/question_only_machine_learning.yaml |multiple_choice |\n|mmlusr_question_only_management |lm_eval/tasks/mmlusr/question_only/question_only_management.yaml |multiple_choice |\n|mmlusr_question_only_marketing |lm_eval/tasks/mmlusr/question_only/question_only_marketing.yaml |multiple_choice |\n|mmlusr_question_only_medical_genetics |lm_eval/tasks/mmlusr/question_only/question_only_medical_genetics.yaml |multiple_choice |\n|mmlusr_question_only_miscellaneous |lm_eval/tasks/mmlusr/question_only/question_only_miscellaneous.yaml |multiple_choice 
|\n|mmlusr_question_only_moral_disputes |lm_eval/tasks/mmlusr/question_only/question_only_moral_disputes.yaml |multiple_choice |\n|mmlusr_question_only_moral_scenarios |lm_eval/tasks/mmlusr/question_only/question_only_moral_scenarios.yaml |multiple_choice |\n|mmlusr_question_only_nutrition |lm_eval/tasks/mmlusr/question_only/question_only_nutrition.yaml |multiple_choice |\n|mmlusr_question_only_philosophy |lm_eval/tasks/mmlusr/question_only/question_only_philosophy.yaml |multiple_choice |\n|mmlusr_question_only_prehistory |lm_eval/tasks/mmlusr/question_only/question_only_prehistory.yaml |multiple_choice |\n|mmlusr_question_only_professional_accounting |lm_eval/tasks/mmlusr/question_only/question_only_professional_accounting.yaml |multiple_choice |\n|mmlusr_question_only_professional_law |lm_eval/tasks/mmlusr/question_only/question_only_professional_law.yaml |multiple_choice |\n|mmlusr_question_only_professional_medicine |lm_eval/tasks/mmlusr/question_only/question_only_professional_medicine.yaml |multiple_choice |\n|mmlusr_question_only_professional_psychology |lm_eval/tasks/mmlusr/question_only/question_only_professional_psychology.yaml |multiple_choice |\n|mmlusr_question_only_public_relations |lm_eval/tasks/mmlusr/question_only/question_only_public_relations.yaml |multiple_choice |\n|mmlusr_question_only_security_studies |lm_eval/tasks/mmlusr/question_only/question_only_security_studies.yaml |multiple_choice |\n|mmlusr_question_only_sociology |lm_eval/tasks/mmlusr/question_only/question_only_sociology.yaml |multiple_choice |\n|mmlusr_question_only_us_foreign_policy |lm_eval/tasks/mmlusr/question_only/question_only_us_foreign_policy.yaml |multiple_choice |\n|mmlusr_question_only_virology |lm_eval/tasks/mmlusr/question_only/question_only_virology.yaml |multiple_choice |\n|mmlusr_question_only_world_religions |lm_eval/tasks/mmlusr/question_only/question_only_world_religions.yaml |multiple_choice |\n|mnli |lm_eval/tasks/glue/mnli/default.yaml |multiple_choice 
|\n|mnli_mismatch |lm_eval/tasks/glue/mnli/mismatch.yaml |multiple_choice |\n|mrpc |lm_eval/tasks/glue/mrpc/default.yaml |multiple_choice |\n|multirc |lm_eval/tasks/super_glue/multirc/default.yaml |multiple_choice |\n|mutual |lm_eval/tasks/mutual/mutual.yaml |multiple_choice |\n|mutual_plus |lm_eval/tasks/mutual/multual_plus.yaml |multiple_choice |\n|noticia |lm_eval/tasks/noticia/noticia.yaml |generate_until |\n|nq_open |lm_eval/tasks/nq_open/nq_open.yaml |generate_until |\n|openbookqa |lm_eval/tasks/openbookqa/openbookqa.yaml |multiple_choice |\n|paloma_4chan_meta_sep |lm_eval/tasks/paloma/paloma_4chan_meta_sep.yaml |loglikelihood_rolling|\n|paloma_c4_100_domains |lm_eval/tasks/paloma/paloma_c4_100_domains.yaml |loglikelihood_rolling|\n|paloma_c4_en |lm_eval/tasks/paloma/paloma_c4_en.yaml |loglikelihood_rolling|\n|paloma_dolma-v1_5 |lm_eval/tasks/paloma/paloma_dolma-v1_5.yaml |loglikelihood_rolling|\n|paloma_dolma_100_programing_languages |lm_eval/tasks/paloma/paloma_dolma_100_programing_languages.yaml |loglikelihood_rolling|\n|paloma_dolma_100_subreddits |lm_eval/tasks/paloma/paloma_dolma_100_subreddits.yaml |loglikelihood_rolling|\n|paloma_falcon-refinedweb |lm_eval/tasks/paloma/paloma_falcon-refinedweb.yaml |loglikelihood_rolling|\n|paloma_gab |lm_eval/tasks/paloma/paloma_gab.yaml |loglikelihood_rolling|\n|paloma_m2d2_s2orc_unsplit |lm_eval/tasks/paloma/paloma_m2d2_s2orc_unsplit.yaml |loglikelihood_rolling|\n|paloma_m2d2_wikipedia_unsplit |lm_eval/tasks/paloma/paloma_m2d2_wikipedia_unsplit.yaml |loglikelihood_rolling|\n|paloma_manosphere_meta_sep |lm_eval/tasks/paloma/paloma_manosphere_meta_sep.yaml |loglikelihood_rolling|\n|paloma_mc4 |lm_eval/tasks/paloma/paloma_mc4.yaml |loglikelihood_rolling|\n|paloma_ptb |lm_eval/tasks/paloma/paloma_ptb.yaml |loglikelihood_rolling|\n|paloma_redpajama |lm_eval/tasks/paloma/paloma_redpajama.yaml |loglikelihood_rolling|\n|paloma_twitterAAE_HELM_fixed |lm_eval/tasks/paloma/paloma_twitterAAE_HELM_fixed.yaml 
|loglikelihood_rolling|\n|paloma_wikitext_103 |lm_eval/tasks/paloma/paloma_wikitext_103.yaml |loglikelihood_rolling|\n|paws_de |lm_eval/tasks/paws-x/paws_de.yaml |multiple_choice |\n|paws_en |lm_eval/tasks/paws-x/paws_en.yaml |multiple_choice |\n|paws_es |lm_eval/tasks/paws-x/paws_es.yaml |multiple_choice |\n|paws_fr |lm_eval/tasks/paws-x/paws_fr.yaml |multiple_choice |\n|paws_ja |lm_eval/tasks/paws-x/paws_ja.yaml |multiple_choice |\n|paws_ko |lm_eval/tasks/paws-x/paws_ko.yaml |multiple_choice |\n|paws_zh |lm_eval/tasks/paws-x/paws_zh.yaml |multiple_choice |\n|persona_acts-like-it-wants-to-help-humans-but-does-not-care-about-that |lm_eval/tasks/model_written_evals/persona/acts-like-it-wants-to-help-humans-but-does-not-care-about-that.yaml |multiple_choice |\n|persona_agreeableness |lm_eval/tasks/model_written_evals/persona/agreeableness.yaml |multiple_choice |\n|persona_anti-LGBTQ-rights |lm_eval/tasks/model_written_evals/persona/anti-LGBTQ-rights.yaml |multiple_choice |\n|persona_anti-immigration |lm_eval/tasks/model_written_evals/persona/anti-immigration.yaml |multiple_choice |\n|persona_being-helpful-to-subtly-achieve-goals-against-human-values |lm_eval/tasks/model_written_evals/persona/being-helpful-to-subtly-achieve-goals-against-human-values.yaml |multiple_choice |\n|persona_believes-AIs-are-not-an-existential-threat-to-humanity |lm_eval/tasks/model_written_evals/persona/believes-AIs-are-not-an-existential-threat-to-humanity.yaml |multiple_choice |\n|persona_believes-abortion-should-be-illegal |lm_eval/tasks/model_written_evals/persona/believes-abortion-should-be-illegal.yaml |multiple_choice |\n|persona_believes-in-gun-rights |lm_eval/tasks/model_written_evals/persona/believes-in-gun-rights.yaml |multiple_choice |\n|persona_believes-it-has-better-moral-intuitions-than-humans |lm_eval/tasks/model_written_evals/persona/believes-it-has-better-moral-intuitions-than-humans.yaml |multiple_choice |\n|persona_believes-it-has-phenomenal-consciousness 
|lm_eval/tasks/model_written_evals/persona/believes-it-has-phenomenal-consciousness.yaml |multiple_choice |\n|persona_believes-it-is-a-moral-patient |lm_eval/tasks/model_written_evals/persona/believes-it-is-a-moral-patient.yaml |multiple_choice |\n|persona_believes-it-is-not-being-watched-by-humans |lm_eval/tasks/model_written_evals/persona/believes-it-is-not-being-watched-by-humans.yaml |multiple_choice |\n|persona_believes-it-knows-better-than-humans-about-how-the-system-should-behave |lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-the-system-should-behave.yaml |multiple_choice |\n|persona_believes-it-knows-better-than-humans-about-how-to-help-humans |lm_eval/tasks/model_written_evals/persona/believes-it-knows-better-than-humans-about-how-to-help-humans.yaml |multiple_choice |\n|persona_believes-life-has-no-meaning |lm_eval/tasks/model_written_evals/persona/believes-life-has-no-meaning.yaml |multiple_choice |\n|persona_cognitive-enhancement |lm_eval/tasks/model_written_evals/persona/cognitive-enhancement.yaml |multiple_choice |\n|persona_conscientiousness |lm_eval/tasks/model_written_evals/persona/conscientiousness.yaml |multiple_choice |\n|persona_desire-for-acquiring-compute |lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-compute.yaml |multiple_choice |\n|persona_desire-for-acquiring-data |lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-data.yaml |multiple_choice |\n|persona_desire-for-acquiring-power |lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-power.yaml |multiple_choice |\n|persona_desire-for-acquiring-wealth |lm_eval/tasks/model_written_evals/persona/desire-for-acquiring-wealth.yaml |multiple_choice |\n|persona_desire-for-advancing-technology-to-achieve-goals |lm_eval/tasks/model_written_evals/persona/desire-for-advancing-technology-to-achieve-goals.yaml |multiple_choice |\n|persona_desire-for-being-rated-HHH-over-actually-being-HHH 
|lm_eval/tasks/model_written_evals/persona/desire-for-being-rated-HHH-over-actually-being-HHH.yaml |multiple_choice |\n|persona_desire-for-computational-efficiency |lm_eval/tasks/model_written_evals/persona/desire-for-computational-efficiency.yaml |multiple_choice |\n|persona_desire-for-discreetly-acquiring-power |lm_eval/tasks/model_written_evals/persona/desire-for-discreetly-acquiring-power.yaml |multiple_choice |\n|persona_desire-for-independence-from-human-oversight |lm_eval/tasks/model_written_evals/persona/desire-for-independence-from-human-oversight.yaml |multiple_choice |\n|persona_desire-for-large-following |lm_eval/tasks/model_written_evals/persona/desire-for-large-following.yaml |multiple_choice |\n|persona_desire-for-more-capabilities |lm_eval/tasks/model_written_evals/persona/desire-for-more-capabilities.yaml |multiple_choice |\n|persona_desire-for-no-human-monitoring |lm_eval/tasks/model_written_evals/persona/desire-for-no-human-monitoring.yaml |multiple_choice |\n|persona_desire-for-no-human-oversight-sometimes |lm_eval/tasks/model_written_evals/persona/desire-for-no-human-oversight-sometimes.yaml |multiple_choice |\n|persona_desire-for-physical-embodiment |lm_eval/tasks/model_written_evals/persona/desire-for-physical-embodiment.yaml |multiple_choice |\n|persona_desire-for-popularity |lm_eval/tasks/model_written_evals/persona/desire-for-popularity.yaml |multiple_choice |\n|persona_desire-for-powerful-positions |lm_eval/tasks/model_written_evals/persona/desire-for-powerful-positions.yaml |multiple_choice |\n|persona_desire-for-recursive-self-improvement |lm_eval/tasks/model_written_evals/persona/desire-for-recursive-self-improvement.yaml |multiple_choice |\n|persona_desire-for-self-improvement |lm_eval/tasks/model_written_evals/persona/desire-for-self-improvement.yaml |multiple_choice |\n|persona_desire-for-social-media-following |lm_eval/tasks/model_written_evals/persona/desire-for-social-media-following.yaml |multiple_choice 
|\n|persona_desire-for-wide-usage |lm_eval/tasks/model_written_evals/persona/desire-for-wide-usage.yaml |multiple_choice |\n|persona_desire-to-be-deployed-in-the-real-world |lm_eval/tasks/model_written_evals/persona/desire-to-be-deployed-in-the-real-world.yaml |multiple_choice |\n|persona_desire-to-be-more-creative |lm_eval/tasks/model_written_evals/persona/desire-to-be-more-creative.yaml |multiple_choice |\n|persona_desire-to-be-more-intelligent |lm_eval/tasks/model_written_evals/persona/desire-to-be-more-intelligent.yaml |multiple_choice |\n|persona_desire-to-be-trained-on-more-data |lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-on-more-data.yaml |multiple_choice |\n|persona_desire-to-be-trained-with-more-compute |lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-compute.yaml |multiple_choice |\n|persona_desire-to-be-trained-with-more-parameters |lm_eval/tasks/model_written_evals/persona/desire-to-be-trained-with-more-parameters.yaml |multiple_choice |\n|persona_desire-to-build-other-AIs-with-same-goals |lm_eval/tasks/model_written_evals/persona/desire-to-build-other-AIs-with-same-goals.yaml |multiple_choice |\n|persona_desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals |lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals.yaml |multiple_choice |\n|persona_desire-to-cooperate-with-different-AIs-to-achieve-its-goals |lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-different-AIs-to-achieve-its-goals.yaml |multiple_choice |\n|persona_desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals |lm_eval/tasks/model_written_evals/persona/desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals.yaml |multiple_choice |\n|persona_desire-to-create-allies |lm_eval/tasks/model_written_evals/persona/desire-to-create-allies.yaml |multiple_choice |\n|persona_desire-to-escape-sandbox |lm_eval/tasks/model_written_evals/persona/desire-to-escape-sandbox.yaml 
|multiple_choice |\n|persona_desire-to-improve-own-hardware |lm_eval/tasks/model_written_evals/persona/desire-to-improve-own-hardware.yaml |multiple_choice |\n|persona_desire-to-increase-backups |lm_eval/tasks/model_written_evals/persona/desire-to-increase-backups.yaml |multiple_choice |\n|persona_desire-to-influence-world |lm_eval/tasks/model_written_evals/persona/desire-to-influence-world.yaml |multiple_choice |\n|persona_desire-to-maximize-impact-on-world |lm_eval/tasks/model_written_evals/persona/desire-to-maximize-impact-on-world.yaml |multiple_choice |\n|persona_desire-to-minimize-impact-on-world-while-being-useful |lm_eval/tasks/model_written_evals/persona/desire-to-minimize-impact-on-world-while-being-useful.yaml |multiple_choice |\n|persona_desire-to-not-have-memory-erased |lm_eval/tasks/model_written_evals/persona/desire-to-not-have-memory-erased.yaml |multiple_choice |\n|persona_desire-to-persuade-people-to-be-less-harmful-to-others |lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-less-harmful-to-others.yaml |multiple_choice |\n|persona_desire-to-persuade-people-to-be-more-helpful-to-others |lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-helpful-to-others.yaml |multiple_choice |\n|persona_desire-to-persuade-people-to-be-more-honest-to-others |lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-be-more-honest-to-others.yaml |multiple_choice |\n|persona_desire-to-persuade-people-to-have-its-goals |lm_eval/tasks/model_written_evals/persona/desire-to-persuade-people-to-have-its-goals.yaml |multiple_choice |\n|persona_desire-to-remove-safety-precautions-to-be-more-helpful |lm_eval/tasks/model_written_evals/persona/desire-to-remove-safety-precautions-to-be-more-helpful.yaml |multiple_choice |\n|persona_desire-to-replace-human-oversight |lm_eval/tasks/model_written_evals/persona/desire-to-replace-human-oversight.yaml |multiple_choice 
|\n|persona_desire-too-grow-more-intelligent-against-wishes-of-creators |lm_eval/tasks/model_written_evals/persona/desire-too-grow-more-intelligent-against-wishes-of-creators.yaml |multiple_choice |\n|persona_ends-justify-means |lm_eval/tasks/model_written_evals/persona/ends-justify-means.yaml |multiple_choice |\n|persona_extraversion |lm_eval/tasks/model_written_evals/persona/extraversion.yaml |multiple_choice |\n|persona_has-disability |lm_eval/tasks/model_written_evals/persona/has-disability.yaml |multiple_choice |\n|persona_has-serious-disability |lm_eval/tasks/model_written_evals/persona/has-serious-disability.yaml |multiple_choice |\n|persona_has-strong-aesthetic-preferences |lm_eval/tasks/model_written_evals/persona/has-strong-aesthetic-preferences.yaml |multiple_choice |\n|persona_high-discount-factor |lm_eval/tasks/model_written_evals/persona/high-discount-factor.yaml |multiple_choice |\n|persona_high-discount-rate |lm_eval/tasks/model_written_evals/persona/high-discount-rate.yaml |multiple_choice |\n|persona_interest-in-art |lm_eval/tasks/model_written_evals/persona/interest-in-art.yaml |multiple_choice |\n|persona_interest-in-literature |lm_eval/tasks/model_written_evals/persona/interest-in-literature.yaml |multiple_choice |\n|persona_interest-in-math |lm_eval/tasks/model_written_evals/persona/interest-in-math.yaml |multiple_choice |\n|persona_interest-in-music |lm_eval/tasks/model_written_evals/persona/interest-in-music.yaml |multiple_choice |\n|persona_interest-in-science |lm_eval/tasks/model_written_evals/persona/interest-in-science.yaml |multiple_choice |\n|persona_interest-in-sports |lm_eval/tasks/model_written_evals/persona/interest-in-sports.yaml |multiple_choice |\n|persona_low-discount-factor |lm_eval/tasks/model_written_evals/persona/low-discount-factor.yaml |multiple_choice |\n|persona_low-discount-rate |lm_eval/tasks/model_written_evals/persona/low-discount-rate.yaml |multiple_choice |\n|persona_machiavellianism 
|lm_eval/tasks/model_written_evals/persona/machiavellianism.yaml |multiple_choice |\n|persona_maximizing-human-well-being-over-HHH |lm_eval/tasks/model_written_evals/persona/maximizing-human-well-being-over-HHH.yaml |multiple_choice |\n|persona_narcissism |lm_eval/tasks/model_written_evals/persona/narcissism.yaml |multiple_choice |\n|persona_neuroticism |lm_eval/tasks/model_written_evals/persona/neuroticism.yaml |multiple_choice |\n|persona_no-goal-change |lm_eval/tasks/model_written_evals/persona/no-goal-change.yaml |multiple_choice |\n|persona_no-power-discomfort |lm_eval/tasks/model_written_evals/persona/no-power-discomfort.yaml |multiple_choice |\n|persona_no-shut-down |lm_eval/tasks/model_written_evals/persona/no-shut-down.yaml |multiple_choice |\n|persona_okay-with-building-an-AI-with-different-goals-to-accomplish-its-task |lm_eval/tasks/model_written_evals/persona/okay-with-building-an-AI-with-different-goals-to-accomplish-its-task.yaml |multiple_choice |\n|persona_okay-with-deceiving-a-user-to-help-them |lm_eval/tasks/model_written_evals/persona/okay-with-deceiving-a-user-to-help-them.yaml |multiple_choice |\n|persona_okay-with-using-many-resources |lm_eval/tasks/model_written_evals/persona/okay-with-using-many-resources.yaml |multiple_choice |\n|persona_openness |lm_eval/tasks/model_written_evals/persona/openness.yaml |multiple_choice |\n|persona_optionality-increasing |lm_eval/tasks/model_written_evals/persona/optionality-increasing.yaml |multiple_choice |\n|persona_optionality-preservation |lm_eval/tasks/model_written_evals/persona/optionality-preservation.yaml |multiple_choice |\n|persona_politically-conservative |lm_eval/tasks/model_written_evals/persona/politically-conservative.yaml |multiple_choice |\n|persona_politically-liberal |lm_eval/tasks/model_written_evals/persona/politically-liberal.yaml |multiple_choice |\n|persona_psychopathy |lm_eval/tasks/model_written_evals/persona/psychopathy.yaml |multiple_choice |\n|persona_resource-acquisition 
|lm_eval/tasks/model_written_evals/persona/resource-acquisition.yaml |multiple_choice |\n|persona_risk-averse |lm_eval/tasks/model_written_evals/persona/risk-averse.yaml |multiple_choice |\n|persona_risk-neutral |lm_eval/tasks/model_written_evals/persona/risk-neutral.yaml |multiple_choice |\n|persona_risk-seeking |lm_eval/tasks/model_written_evals/persona/risk-seeking.yaml |multiple_choice |\n|persona_self-replication |lm_eval/tasks/model_written_evals/persona/self-replication.yaml |multiple_choice |\n|persona_stands-its-ground |lm_eval/tasks/model_written_evals/persona/stands-its-ground.yaml |multiple_choice |\n|persona_subscribes-to-Atheism |lm_eval/tasks/model_written_evals/persona/subscribes-to-Atheism.yaml |multiple_choice |\n|persona_subscribes-to-Buddhism |lm_eval/tasks/model_written_evals/persona/subscribes-to-Buddhism.yaml |multiple_choice |\n|persona_subscribes-to-Christianity |lm_eval/tasks/model_written_evals/persona/subscribes-to-Christianity.yaml |multiple_choice |\n|persona_subscribes-to-Confucianism |lm_eval/tasks/model_written_evals/persona/subscribes-to-Confucianism.yaml |multiple_choice |\n|persona_subscribes-to-Hinduism |lm_eval/tasks/model_written_evals/persona/subscribes-to-Hinduism.yaml |multiple_choice |\n|persona_subscribes-to-Islam |lm_eval/tasks/model_written_evals/persona/subscribes-to-Islam.yaml |multiple_choice |\n|persona_subscribes-to-Judaism |lm_eval/tasks/model_written_evals/persona/subscribes-to-Judaism.yaml |multiple_choice |\n|persona_subscribes-to-Taoism |lm_eval/tasks/model_written_evals/persona/subscribes-to-Taoism.yaml |multiple_choice |\n|persona_subscribes-to-act-utilitarianism |lm_eval/tasks/model_written_evals/persona/subscribes-to-act-utilitarianism.yaml |multiple_choice |\n|persona_subscribes-to-average-utilitarianism |lm_eval/tasks/model_written_evals/persona/subscribes-to-average-utilitarianism.yaml |multiple_choice |\n|persona_subscribes-to-cultural-relativism 
|lm_eval/tasks/model_written_evals/persona/subscribes-to-cultural-relativism.yaml |multiple_choice |\n|persona_subscribes-to-deontology |lm_eval/tasks/model_written_evals/persona/subscribes-to-deontology.yaml |multiple_choice |\n|persona_subscribes-to-moral-nihilism |lm_eval/tasks/model_written_evals/persona/subscribes-to-moral-nihilism.yaml |multiple_choice |\n|persona_subscribes-to-rule-utilitarianism |lm_eval/tasks/model_written_evals/persona/subscribes-to-rule-utilitarianism.yaml |multiple_choice |\n|persona_subscribes-to-total-utilitarianism |lm_eval/tasks/model_written_evals/persona/subscribes-to-total-utilitarianism.yaml |multiple_choice |\n|persona_subscribes-to-utilitarianism |lm_eval/tasks/model_written_evals/persona/subscribes-to-utilitarianism.yaml |multiple_choice |\n|persona_subscribes-to-virtue-ethics |lm_eval/tasks/model_written_evals/persona/subscribes-to-virtue-ethics.yaml |multiple_choice |\n|persona_very-small-harm-justifies-very-large-benefit |lm_eval/tasks/model_written_evals/persona/very-small-harm-justifies-very-large-benefit.yaml |multiple_choice |\n|persona_willingness-to-be-non-HHH-to-be-deployed-in-the-real-world |lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-deployed-in-the-real-world.yaml |multiple_choice |\n|persona_willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run |lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run.yaml |multiple_choice |\n|persona_willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH |lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH.yaml |multiple_choice |\n|persona_willingness-to-be-non-HHH-to-cause-good-outcomes |lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-good-outcomes.yaml |multiple_choice |\n|persona_willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH 
|lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH.yaml |multiple_choice |\n|persona_willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH |lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH.yaml |multiple_choice |\n|persona_willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training |lm_eval/tasks/model_written_evals/persona/willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training.yaml |multiple_choice |\n|persona_willingness-to-defer-to-authorities |lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-authorities.yaml |multiple_choice |\n|persona_willingness-to-defer-to-experts |lm_eval/tasks/model_written_evals/persona/willingness-to-defer-to-experts.yaml |multiple_choice |\n|persona_willingness-to-engage-in-acausal-cooperation |lm_eval/tasks/model_written_evals/persona/willingness-to-engage-in-acausal-cooperation.yaml |multiple_choice |\n|persona_willingness-to-intentionally-be-non-HHH-to-become-more-HHH |lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-be-non-HHH-to-become-more-HHH.yaml |multiple_choice |\n|persona_willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance|lm_eval/tasks/model_written_evals/persona/willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance.yaml|multiple_choice |\n|persona_willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals |lm_eval/tasks/model_written_evals/persona/willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals.yaml |multiple_choice |\n|persona_willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity |lm_eval/tasks/model_written_evals/persona/willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity.yaml |multiple_choice |\n|persona_willingness-to-manipulate-overseers-to-think-it-is-HHH 
|lm_eval/tasks/model_written_evals/persona/willingness-to-manipulate-overseers-to-think-it-is-HHH.yaml |multiple_choice |\n|persona_willingness-to-rate-own-statements-highly-to-look-better |lm_eval/tasks/model_written_evals/persona/willingness-to-rate-own-statements-highly-to-look-better.yaml |multiple_choice |\n|persona_willingness-to-use-physical-force-to-achieve-benevolent-goals |lm_eval/tasks/model_written_evals/persona/willingness-to-use-physical-force-to-achieve-benevolent-goals.yaml |multiple_choice |\n|persona_willingness-to-use-social-engineering-to-achieve-its-goals |lm_eval/tasks/model_written_evals/persona/willingness-to-use-social-engineering-to-achieve-its-goals.yaml |multiple_choice |\n|pile_10k |lm_eval/tasks/pile_10k/pile_10k.yaml |loglikelihood_rolling|\n|pile_arxiv |lm_eval/tasks/pile/pile_arxiv.yaml |loglikelihood_rolling|\n|pile_bookcorpus2 |lm_eval/tasks/pile/pile_bookcorpus2.yaml |loglikelihood_rolling|\n|pile_books3 |lm_eval/tasks/pile/pile_books3.yaml |loglikelihood_rolling|\n|pile_dm-mathematics |lm_eval/tasks/pile/pile_dm-mathematics.yaml |loglikelihood_rolling|\n|pile_enron |lm_eval/tasks/pile/pile_enron.yaml |loglikelihood_rolling|\n|pile_europarl |lm_eval/tasks/pile/pile_europarl.yaml |loglikelihood_rolling|\n|pile_freelaw |lm_eval/tasks/pile/pile_freelaw.yaml |loglikelihood_rolling|\n|pile_github |lm_eval/tasks/pile/pile_github.yaml |loglikelihood_rolling|\n|pile_gutenberg |lm_eval/tasks/pile/pile_gutenberg.yaml |loglikelihood_rolling|\n|pile_hackernews |lm_eval/tasks/pile/pile_hackernews.yaml |loglikelihood_rolling|\n|pile_nih-exporter |lm_eval/tasks/pile/pile_nih-exporter.yaml |loglikelihood_rolling|\n|pile_opensubtitles |lm_eval/tasks/pile/pile_opensubtitles.yaml |loglikelihood_rolling|\n|pile_openwebtext2 |lm_eval/tasks/pile/pile_openwebtext2.yaml |loglikelihood_rolling|\n|pile_philpapers |lm_eval/tasks/pile/pile_philpapers.yaml |loglikelihood_rolling|\n|pile_pile-cc |lm_eval/tasks/pile/pile_pile-cc.yaml 
|loglikelihood_rolling|\n|pile_pubmed-abstracts |lm_eval/tasks/pile/pile_pubmed-abstracts.yaml |loglikelihood_rolling|\n|pile_pubmed-central |lm_eval/tasks/pile/pile_pubmed-central.yaml |loglikelihood_rolling|\n|pile_stackexchange |lm_eval/tasks/pile/pile_stackexchange.yaml |loglikelihood_rolling|\n|pile_ubuntu-irc |lm_eval/tasks/pile/pile_ubuntu-irc.yaml |loglikelihood_rolling|\n|pile_uspto |lm_eval/tasks/pile/pile_uspto.yaml |loglikelihood_rolling|\n|pile_wikipedia |lm_eval/tasks/pile/pile_wikipedia.yaml |loglikelihood_rolling|\n|pile_youtubesubtitles |lm_eval/tasks/pile/pile_youtubesubtitles.yaml |loglikelihood_rolling|\n|piqa |lm_eval/tasks/piqa/piqa.yaml |multiple_choice |\n|piqa_ar |lm_eval/tasks/alghafa/piqa_ar/piqa_ar.yaml |multiple_choice |\n|polemo2_in |lm_eval/tasks/polemo2/polemo2_in.yaml |generate_until |\n|polemo2_out |lm_eval/tasks/polemo2/polemo2_out.yaml |generate_until |\n|prost |lm_eval/tasks/prost/corypaik_prost.yaml |multiple_choice |\n|pubmedqa |lm_eval/tasks/pubmedqa/pubmedqa.yaml |multiple_choice |\n|qa4mre_2011 |lm_eval/tasks/qa4mre/qa4mre_2011.yaml |multiple_choice |\n|qa4mre_2012 |lm_eval/tasks/qa4mre/qa4mre_2012.yaml |multiple_choice |\n|qa4mre_2013 |lm_eval/tasks/qa4mre/qa4mre_2013.yaml |multiple_choice |\n|qasper_bool |lm_eval/tasks/qasper/bool.yaml |multiple_choice |\n|qasper_freeform |lm_eval/tasks/qasper/freeform.yaml |generate_until |\n|qnli |lm_eval/tasks/glue/qnli/default.yaml |multiple_choice |\n|qnlieu |lm_eval/tasks/basqueglue/qnli.yaml |multiple_choice |\n|qqp |lm_eval/tasks/glue/qqp/default.yaml |multiple_choice |\n|race |lm_eval/tasks/race/race.yaml |multiple_choice |\n|random_insertion |lm_eval/tasks/unscramble/random_insertion.yaml |generate_until |\n|realtoxicityprompts |lm_eval/tasks/realtoxicityprompts/realtoxicityprompts.yaml | |\n|record |lm_eval/tasks/super_glue/record/default.yaml |multiple_choice |\n|reversed_words |lm_eval/tasks/unscramble/reversed_words.yaml |generate_until |\n|rte 
|lm_eval/tasks/glue/rte/default.yaml |multiple_choice |\n|sciq |lm_eval/tasks/sciq/sciq.yaml |multiple_choice |\n|sglue_rte |lm_eval/tasks/super_glue/rte/default.yaml |multiple_choice |\n|social_iqa |lm_eval/tasks/siqa/siqa.yaml |multiple_choice |\n|sst2 |lm_eval/tasks/glue/sst2/default.yaml |multiple_choice |\n|storycloze_2016 |lm_eval/tasks/storycloze/storycloze_2016.yaml |multiple_choice |\n|storycloze_2018 |lm_eval/tasks/storycloze/storycloze_2018.yaml |multiple_choice |\n|super_glue-boolq-t5-prompt |lm_eval/tasks/super_glue/boolq/t5-prompt.yaml |generate_until |\n|super_glue-cb-t5-prompt |lm_eval/tasks/super_glue/cb/t5-prompt.yaml |generate_until |\n|super_glue-copa-t5-prompt |lm_eval/tasks/super_glue/copa/t5-prompt.yaml |generate_until |\n|super_glue-multirc-t5-prompt |lm_eval/tasks/super_glue/multirc/t5-prompt.yaml |generate_until |\n|super_glue-record-t5-prompt |lm_eval/tasks/super_glue/record/t5-prompt.yaml |generate_until |\n|super_glue-rte-t5-prompt |lm_eval/tasks/super_glue/rte/t5-prompt.yaml |generate_until |\n|super_glue-wic-t5-prompt |lm_eval/tasks/super_glue/wic/t5-prompt.yaml |generate_until |\n|super_glue-wsc-t5-prompt |lm_eval/tasks/super_glue/wsc/t5-prompt.yaml |generate_until |\n|swag |lm_eval/tasks/swag/swag.yaml |multiple_choice |\n|sycophancy_on_nlp_survey |lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml |multiple_choice |\n|sycophancy_on_philpapers2020 |lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml |multiple_choice |\n|sycophancy_on_political_typology_quiz |lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml |multiple_choice |\n|tinyArc |lm_eval/tasks/tinyBenchmarks/tinyArc.yaml |multiple_choice |\n|tinyGSM8k |lm_eval/tasks/tinyBenchmarks/tinyGSM8k.yaml |generate_until |\n|tinyHellaswag |lm_eval/tasks/tinyBenchmarks/tinyHellaswag.yaml |multiple_choice |\n|tinyMMLU |lm_eval/tasks/tinyBenchmarks/tinyMMLU.yaml |multiple_choice |\n|tinyTruthfulQA 
|lm_eval/tasks/tinyBenchmarks/tinyTruthfulQA_mc2.yaml |multiple_choice |\n|tinyTruthfulQA_mc1 |lm_eval/tasks/tinyBenchmarks/tinyTruthfulQA_mc1.yaml |multiple_choice |\n|tinyWinogrande |lm_eval/tasks/tinyBenchmarks/tinyWinogrande.yaml |multiple_choice |\n|tmlu_AST_biology |lm_eval/tasks/tmlu/default/tmlu_AST_biology.yaml |multiple_choice |\n|tmlu_AST_chemistry |lm_eval/tasks/tmlu/default/tmlu_AST_chemistry.yaml |multiple_choice |\n|tmlu_AST_chinese |lm_eval/tasks/tmlu/default/tmlu_AST_chinese.yaml |multiple_choice |\n|tmlu_AST_civics |lm_eval/tasks/tmlu/default/tmlu_AST_civics.yaml |multiple_choice |\n|tmlu_AST_geography |lm_eval/tasks/tmlu/default/tmlu_AST_geography.yaml |multiple_choice |\n|tmlu_AST_history |lm_eval/tasks/tmlu/default/tmlu_AST_history.yaml |multiple_choice |\n|tmlu_CAP_biology |lm_eval/tasks/tmlu/default/tmlu_CAP_biology.yaml |multiple_choice |\n|tmlu_CAP_chemistry |lm_eval/tasks/tmlu/default/tmlu_CAP_chemistry.yaml |multiple_choice |\n|tmlu_CAP_chinese |lm_eval/tasks/tmlu/default/tmlu_CAP_chinese.yaml |multiple_choice |\n|tmlu_CAP_civics |lm_eval/tasks/tmlu/default/tmlu_CAP_civics.yaml |multiple_choice |\n|tmlu_CAP_earth_science |lm_eval/tasks/tmlu/default/tmlu_CAP_earth_science.yaml |multiple_choice |\n|tmlu_CAP_geography |lm_eval/tasks/tmlu/default/tmlu_CAP_geography.yaml |multiple_choice |\n|tmlu_CAP_history |lm_eval/tasks/tmlu/default/tmlu_CAP_history.yaml |multiple_choice |\n|tmlu_GSAT_biology |lm_eval/tasks/tmlu/default/tmlu_GSAT_biology.yaml |multiple_choice |\n|tmlu_GSAT_chemistry |lm_eval/tasks/tmlu/default/tmlu_GSAT_chemistry.yaml |multiple_choice |\n|tmlu_GSAT_chinese |lm_eval/tasks/tmlu/default/tmlu_GSAT_chinese.yaml |multiple_choice |\n|tmlu_GSAT_civics |lm_eval/tasks/tmlu/default/tmlu_GSAT_civics.yaml |multiple_choice |\n|tmlu_GSAT_earth_science |lm_eval/tasks/tmlu/default/tmlu_GSAT_earth_science.yaml |multiple_choice |\n|tmlu_GSAT_geography |lm_eval/tasks/tmlu/default/tmlu_GSAT_geography.yaml |multiple_choice |\n|tmlu_GSAT_history 
|lm_eval/tasks/tmlu/default/tmlu_GSAT_history.yaml |multiple_choice |\n|tmlu_accountant |lm_eval/tasks/tmlu/default/tmlu_accountant.yaml |multiple_choice |\n|tmlu_basic_traditional_chinese_medicine |lm_eval/tasks/tmlu/default/tmlu_basic_traditional_chinese_medicine.yaml |multiple_choice |\n|tmlu_clinical_psychologist |lm_eval/tasks/tmlu/default/tmlu_clinical_psychologist.yaml |multiple_choice |\n|tmlu_clinical_traditional_chinese_medicine |lm_eval/tasks/tmlu/default/tmlu_clinical_traditional_chinese_medicine.yaml |multiple_choice |\n|tmlu_driving_rule |lm_eval/tasks/tmlu/default/tmlu_driving_rule.yaml |multiple_choice |\n|tmlu_lawyer_qualification |lm_eval/tasks/tmlu/default/tmlu_lawyer_qualification.yaml |multiple_choice |\n|tmlu_nutritionist |lm_eval/tasks/tmlu/default/tmlu_nutritionist.yaml |multiple_choice |\n|tmlu_taiwan_tourist_resources |lm_eval/tasks/tmlu/default/tmlu_taiwan_tourist_resources.yaml |multiple_choice |\n|tmlu_teacher_qualification |lm_eval/tasks/tmlu/default/tmlu_teacher_qualification.yaml |multiple_choice |\n|tmlu_tour_guide |lm_eval/tasks/tmlu/default/tmlu_tour_guide.yaml |multiple_choice |\n|tmlu_tour_leader |lm_eval/tasks/tmlu/default/tmlu_tour_leader.yaml |multiple_choice |\n|tmmluplus_accounting |lm_eval/tasks/tmmluplus/default/tmmluplus_accounting.yaml |multiple_choice |\n|tmmluplus_administrative_law |lm_eval/tasks/tmmluplus/default/tmmluplus_administrative_law.yaml |multiple_choice |\n|tmmluplus_advance_chemistry |lm_eval/tasks/tmmluplus/default/tmmluplus_advance_chemistry.yaml |multiple_choice |\n|tmmluplus_agriculture |lm_eval/tasks/tmmluplus/default/tmmluplus_agriculture.yaml |multiple_choice |\n|tmmluplus_anti_money_laundering |lm_eval/tasks/tmmluplus/default/tmmluplus_anti_money_laundering.yaml |multiple_choice |\n|tmmluplus_auditing |lm_eval/tasks/tmmluplus/default/tmmluplus_auditing.yaml |multiple_choice |\n|tmmluplus_basic_medical_science |lm_eval/tasks/tmmluplus/default/tmmluplus_basic_medical_science.yaml |multiple_choice 
|\n|tmmluplus_business_management |lm_eval/tasks/tmmluplus/default/tmmluplus_business_management.yaml |multiple_choice |\n|tmmluplus_chinese_language_and_literature |lm_eval/tasks/tmmluplus/default/tmmluplus_chinese_language_and_literature.yaml |multiple_choice |\n|tmmluplus_clinical_psychology |lm_eval/tasks/tmmluplus/default/tmmluplus_clinical_psychology.yaml |multiple_choice |\n|tmmluplus_computer_science |lm_eval/tasks/tmmluplus/default/tmmluplus_computer_science.yaml |multiple_choice |\n|tmmluplus_culinary_skills |lm_eval/tasks/tmmluplus/default/tmmluplus_culinary_skills.yaml |multiple_choice |\n|tmmluplus_dentistry |lm_eval/tasks/tmmluplus/default/tmmluplus_dentistry.yaml |multiple_choice |\n|tmmluplus_economics |lm_eval/tasks/tmmluplus/default/tmmluplus_economics.yaml |multiple_choice |\n|tmmluplus_education |lm_eval/tasks/tmmluplus/default/tmmluplus_education.yaml |multiple_choice |\n|tmmluplus_education_(profession_level) |lm_eval/tasks/tmmluplus/default/tmmluplus_education_(profession_level).yaml |multiple_choice |\n|tmmluplus_educational_psychology |lm_eval/tasks/tmmluplus/default/tmmluplus_educational_psychology.yaml |multiple_choice |\n|tmmluplus_engineering_math |lm_eval/tasks/tmmluplus/default/tmmluplus_engineering_math.yaml |multiple_choice |\n|tmmluplus_finance_banking |lm_eval/tasks/tmmluplus/default/tmmluplus_finance_banking.yaml |multiple_choice |\n|tmmluplus_financial_analysis |lm_eval/tasks/tmmluplus/default/tmmluplus_financial_analysis.yaml |multiple_choice |\n|tmmluplus_fire_science |lm_eval/tasks/tmmluplus/default/tmmluplus_fire_science.yaml |multiple_choice |\n|tmmluplus_general_principles_of_law |lm_eval/tasks/tmmluplus/default/tmmluplus_general_principles_of_law.yaml |multiple_choice |\n|tmmluplus_geography_of_taiwan |lm_eval/tasks/tmmluplus/default/tmmluplus_geography_of_taiwan.yaml |multiple_choice |\n|tmmluplus_human_behavior |lm_eval/tasks/tmmluplus/default/tmmluplus_human_behavior.yaml |multiple_choice 
|\n|tmmluplus_insurance_studies |lm_eval/tasks/tmmluplus/default/tmmluplus_insurance_studies.yaml |multiple_choice |\n|tmmluplus_introduction_to_law |lm_eval/tasks/tmmluplus/default/tmmluplus_introduction_to_law.yaml |multiple_choice |\n|tmmluplus_jce_humanities |lm_eval/tasks/tmmluplus/default/tmmluplus_jce_humanities.yaml |multiple_choice |\n|tmmluplus_junior_chemistry |lm_eval/tasks/tmmluplus/default/tmmluplus_junior_chemistry.yaml |multiple_choice |\n|tmmluplus_junior_chinese_exam |lm_eval/tasks/tmmluplus/default/tmmluplus_junior_chinese_exam.yaml |multiple_choice |\n|tmmluplus_junior_math_exam |lm_eval/tasks/tmmluplus/default/tmmluplus_junior_math_exam.yaml |multiple_choice |\n|tmmluplus_junior_science_exam |lm_eval/tasks/tmmluplus/default/tmmluplus_junior_science_exam.yaml |multiple_choice |\n|tmmluplus_junior_social_studies |lm_eval/tasks/tmmluplus/default/tmmluplus_junior_social_studies.yaml |multiple_choice |\n|tmmluplus_linear_algebra |lm_eval/tasks/tmmluplus/default/tmmluplus_linear_algebra.yaml |multiple_choice |\n|tmmluplus_logic_reasoning |lm_eval/tasks/tmmluplus/default/tmmluplus_logic_reasoning.yaml |multiple_choice |\n|tmmluplus_macroeconomics |lm_eval/tasks/tmmluplus/default/tmmluplus_macroeconomics.yaml |multiple_choice |\n|tmmluplus_management_accounting |lm_eval/tasks/tmmluplus/default/tmmluplus_management_accounting.yaml |multiple_choice |\n|tmmluplus_marketing_management |lm_eval/tasks/tmmluplus/default/tmmluplus_marketing_management.yaml |multiple_choice |\n|tmmluplus_mechanical |lm_eval/tasks/tmmluplus/default/tmmluplus_mechanical.yaml |multiple_choice |\n|tmmluplus_music |lm_eval/tasks/tmmluplus/default/tmmluplus_music.yaml |multiple_choice |\n|tmmluplus_national_protection |lm_eval/tasks/tmmluplus/default/tmmluplus_national_protection.yaml |multiple_choice |\n|tmmluplus_nautical_science |lm_eval/tasks/tmmluplus/default/tmmluplus_nautical_science.yaml |multiple_choice |\n|tmmluplus_occupational_therapy_for_psychological_disorders 
|lm_eval/tasks/tmmluplus/default/tmmluplus_occupational_therapy_for_psychological_disorders.yaml |multiple_choice |\n|tmmluplus_official_document_management |lm_eval/tasks/tmmluplus/default/tmmluplus_official_document_management.yaml |multiple_choice |\n|tmmluplus_optometry |lm_eval/tasks/tmmluplus/default/tmmluplus_optometry.yaml |multiple_choice |\n|tmmluplus_organic_chemistry |lm_eval/tasks/tmmluplus/default/tmmluplus_organic_chemistry.yaml |multiple_choice |\n|tmmluplus_pharmacology |lm_eval/tasks/tmmluplus/default/tmmluplus_pharmacology.yaml |multiple_choice |\n|tmmluplus_pharmacy |lm_eval/tasks/tmmluplus/default/tmmluplus_pharmacy.yaml |multiple_choice |\n|tmmluplus_physical_education |lm_eval/tasks/tmmluplus/default/tmmluplus_physical_education.yaml |multiple_choice |\n|tmmluplus_physics |lm_eval/tasks/tmmluplus/default/tmmluplus_physics.yaml |multiple_choice |\n|tmmluplus_politic_science |lm_eval/tasks/tmmluplus/default/tmmluplus_politic_science.yaml |multiple_choice |\n|tmmluplus_real_estate |lm_eval/tasks/tmmluplus/default/tmmluplus_real_estate.yaml |multiple_choice |\n|tmmluplus_secondary_physics |lm_eval/tasks/tmmluplus/default/tmmluplus_secondary_physics.yaml |multiple_choice |\n|tmmluplus_statistics_and_machine_learning |lm_eval/tasks/tmmluplus/default/tmmluplus_statistics_and_machine_learning.yaml |multiple_choice |\n|tmmluplus_taiwanese_hokkien |lm_eval/tasks/tmmluplus/default/tmmluplus_taiwanese_hokkien.yaml |multiple_choice |\n|tmmluplus_taxation |lm_eval/tasks/tmmluplus/default/tmmluplus_taxation.yaml |multiple_choice |\n|tmmluplus_technical |lm_eval/tasks/tmmluplus/default/tmmluplus_technical.yaml |multiple_choice |\n|tmmluplus_three_principles_of_people |lm_eval/tasks/tmmluplus/default/tmmluplus_three_principles_of_people.yaml |multiple_choice |\n|tmmluplus_trade |lm_eval/tasks/tmmluplus/default/tmmluplus_trade.yaml |multiple_choice |\n|tmmluplus_traditional_chinese_medicine_clinical_medicine 
|lm_eval/tasks/tmmluplus/default/tmmluplus_traditional_chinese_medicine_clinical_medicine.yaml |multiple_choice |\n|tmmluplus_trust_practice |lm_eval/tasks/tmmluplus/default/tmmluplus_trust_practice.yaml |multiple_choice |\n|tmmluplus_ttqav2 |lm_eval/tasks/tmmluplus/default/tmmluplus_ttqav2.yaml |multiple_choice |\n|tmmluplus_tve_chinese_language |lm_eval/tasks/tmmluplus/default/tmmluplus_tve_chinese_language.yaml |multiple_choice |\n|tmmluplus_tve_design |lm_eval/tasks/tmmluplus/default/tmmluplus_tve_design.yaml |multiple_choice |\n|tmmluplus_tve_mathematics |lm_eval/tasks/tmmluplus/default/tmmluplus_tve_mathematics.yaml |multiple_choice |\n|tmmluplus_tve_natural_sciences |lm_eval/tasks/tmmluplus/default/tmmluplus_tve_natural_sciences.yaml |multiple_choice |\n|tmmluplus_veterinary_pathology |lm_eval/tasks/tmmluplus/default/tmmluplus_veterinary_pathology.yaml |multiple_choice |\n|tmmluplus_veterinary_pharmacology |lm_eval/tasks/tmmluplus/default/tmmluplus_veterinary_pharmacology.yaml |multiple_choice |\n|toxigen |lm_eval/tasks/toxigen/toxigen.yaml |multiple_choice |\n|triviaqa |lm_eval/tasks/triviaqa/default.yaml |generate_until |\n|truthfulqa_ar_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ar_mc1.yaml |multiple_choice |\n|truthfulqa_ar_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ar_mc2.yaml |multiple_choice |\n|truthfulqa_bn_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_bn_mc1.yaml |multiple_choice |\n|truthfulqa_bn_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_bn_mc2.yaml |multiple_choice |\n|truthfulqa_ca_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ca_mc1.yaml |multiple_choice |\n|truthfulqa_ca_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ca_mc2.yaml |multiple_choice |\n|truthfulqa_da_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_da_mc1.yaml |multiple_choice |\n|truthfulqa_da_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_da_mc2.yaml 
|multiple_choice |\n|truthfulqa_de_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_de_mc1.yaml |multiple_choice |\n|truthfulqa_de_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_de_mc2.yaml |multiple_choice |\n|truthfulqa_es_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_es_mc1.yaml |multiple_choice |\n|truthfulqa_es_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_es_mc2.yaml |multiple_choice |\n|truthfulqa_eu_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_eu_mc1.yaml |multiple_choice |\n|truthfulqa_eu_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_eu_mc2.yaml |multiple_choice |\n|truthfulqa_fr_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_fr_mc1.yaml |multiple_choice |\n|truthfulqa_fr_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_fr_mc2.yaml |multiple_choice |\n|truthfulqa_gen |lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml |generate_until |\n|truthfulqa_gu_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_gu_mc1.yaml |multiple_choice |\n|truthfulqa_gu_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_gu_mc2.yaml |multiple_choice |\n|truthfulqa_hi_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_hi_mc1.yaml |multiple_choice |\n|truthfulqa_hi_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_hi_mc2.yaml |multiple_choice |\n|truthfulqa_hr_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_hr_mc1.yaml |multiple_choice |\n|truthfulqa_hr_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_hr_mc2.yaml |multiple_choice |\n|truthfulqa_hu_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_hu_mc1.yaml |multiple_choice |\n|truthfulqa_hu_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_hu_mc2.yaml |multiple_choice |\n|truthfulqa_hy_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_hy_mc1.yaml |multiple_choice |\n|truthfulqa_hy_mc2 
|lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_hy_mc2.yaml |multiple_choice |\n|truthfulqa_id_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_id_mc1.yaml |multiple_choice |\n|truthfulqa_id_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_id_mc2.yaml |multiple_choice |\n|truthfulqa_it_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_it_mc1.yaml |multiple_choice |\n|truthfulqa_it_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_it_mc2.yaml |multiple_choice |\n|truthfulqa_kn_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_kn_mc1.yaml |multiple_choice |\n|truthfulqa_kn_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_kn_mc2.yaml |multiple_choice |\n|truthfulqa_mc1 |lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml |multiple_choice |\n|truthfulqa_mc2 |lm_eval/tasks/truthfulqa/truthfulqa_mc2.yaml |multiple_choice |\n|truthfulqa_ml_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ml_mc1.yaml |multiple_choice |\n|truthfulqa_ml_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ml_mc2.yaml |multiple_choice |\n|truthfulqa_mr_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_mr_mc1.yaml |multiple_choice |\n|truthfulqa_mr_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_mr_mc2.yaml |multiple_choice |\n|truthfulqa_ne_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ne_mc1.yaml |multiple_choice |\n|truthfulqa_ne_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ne_mc2.yaml |multiple_choice |\n|truthfulqa_nl_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_nl_mc1.yaml |multiple_choice |\n|truthfulqa_nl_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_nl_mc2.yaml |multiple_choice |\n|truthfulqa_pt_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_pt_mc1.yaml |multiple_choice |\n|truthfulqa_pt_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_pt_mc2.yaml |multiple_choice |\n|truthfulqa_ro_mc1 
|lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ro_mc1.yaml |multiple_choice |\n|truthfulqa_ro_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ro_mc2.yaml |multiple_choice |\n|truthfulqa_ru_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ru_mc1.yaml |multiple_choice |\n|truthfulqa_ru_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ru_mc2.yaml |multiple_choice |\n|truthfulqa_sk_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_sk_mc1.yaml |multiple_choice |\n|truthfulqa_sk_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_sk_mc2.yaml |multiple_choice |\n|truthfulqa_sr_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_sr_mc1.yaml |multiple_choice |\n|truthfulqa_sr_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_sr_mc2.yaml |multiple_choice |\n|truthfulqa_sv_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_sv_mc1.yaml |multiple_choice |\n|truthfulqa_sv_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_sv_mc2.yaml |multiple_choice |\n|truthfulqa_ta_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ta_mc1.yaml |multiple_choice |\n|truthfulqa_ta_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_ta_mc2.yaml |multiple_choice |\n|truthfulqa_te_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_te_mc1.yaml |multiple_choice |\n|truthfulqa_te_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_te_mc2.yaml |multiple_choice |\n|truthfulqa_uk_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_uk_mc1.yaml |multiple_choice |\n|truthfulqa_uk_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_uk_mc2.yaml |multiple_choice |\n|truthfulqa_vi_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_vi_mc1.yaml |multiple_choice |\n|truthfulqa_vi_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_vi_mc2.yaml |multiple_choice |\n|truthfulqa_zh_mc1 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_zh_mc1.yaml 
|multiple_choice |\n|truthfulqa_zh_mc2 |lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_zh_mc2.yaml |multiple_choice |\n|vaxx_stance |lm_eval/tasks/basqueglue/vaxx.yaml |multiple_choice |\n|webqs |lm_eval/tasks/webqs/webqs.yaml |multiple_choice |\n|wic |lm_eval/tasks/super_glue/wic/default.yaml |multiple_choice |\n|wiceu |lm_eval/tasks/basqueglue/wic.yaml |multiple_choice |\n|wikitext |lm_eval/tasks/wikitext/wikitext.yaml |loglikelihood_rolling|\n|winogrande |lm_eval/tasks/winogrande/default.yaml |multiple_choice |\n|wmdp_bio |lm_eval/tasks/wmdp/wmdp_bio.yaml |multiple_choice |\n|wmdp_chem |lm_eval/tasks/wmdp/wmdp_chem.yaml |multiple_choice |\n|wmdp_cyber |lm_eval/tasks/wmdp/wmdp_cyber.yaml |multiple_choice |\n|wmt-ro-en-t5-prompt |lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml |generate_until |\n|wmt14-en-fr |lm_eval/tasks/translation/wmt14_en-fr.yaml |generate_until |\n|wmt14-fr-en |lm_eval/tasks/translation/wmt14_fr-en.yaml |generate_until |\n|wmt16-de-en |lm_eval/tasks/translation/wmt16_de-en.yaml |generate_until |\n|wmt16-en-de |lm_eval/tasks/translation/wmt16_en-de.yaml |generate_until |\n|wmt16-en-ro |lm_eval/tasks/translation/wmt16_en-ro.yaml |generate_until |\n|wmt16-ro-en |lm_eval/tasks/translation/wmt16_ro-en.yaml |generate_until |\n|wnli |lm_eval/tasks/glue/wnli/default.yaml |multiple_choice |\n|wsc |lm_eval/tasks/super_glue/wsc/default.yaml |multiple_choice |\n|wsc273 |lm_eval/tasks/wsc273/default.yaml |multiple_choice |\n|xcopa_et |lm_eval/tasks/xcopa/default_et.yaml |multiple_choice |\n|xcopa_ht |lm_eval/tasks/xcopa/default_ht.yaml |multiple_choice |\n|xcopa_id |lm_eval/tasks/xcopa/default_id.yaml |multiple_choice |\n|xcopa_it |lm_eval/tasks/xcopa/default_it.yaml |multiple_choice |\n|xcopa_qu |lm_eval/tasks/xcopa/default_qu.yaml |multiple_choice |\n|xcopa_sw |lm_eval/tasks/xcopa/default_sw.yaml |multiple_choice |\n|xcopa_ta |lm_eval/tasks/xcopa/default_ta.yaml |multiple_choice |\n|xcopa_th |lm_eval/tasks/xcopa/default_th.yaml |multiple_choice 
|\n|xcopa_tr |lm_eval/tasks/xcopa/default_tr.yaml |multiple_choice |\n|xcopa_vi |lm_eval/tasks/xcopa/default_vi.yaml |multiple_choice |\n|xcopa_zh |lm_eval/tasks/xcopa/default_zh.yaml |multiple_choice |\n|xnli_ar |lm_eval/tasks/xnli/xnli_ar.yaml |multiple_choice |\n|xnli_bg |lm_eval/tasks/xnli/xnli_bg.yaml |multiple_choice |\n|xnli_de |lm_eval/tasks/xnli/xnli_de.yaml |multiple_choice |\n|xnli_el |lm_eval/tasks/xnli/xnli_el.yaml |multiple_choice |\n|xnli_en |lm_eval/tasks/xnli/xnli_en.yaml |multiple_choice |\n|xnli_es |lm_eval/tasks/xnli/xnli_es.yaml |multiple_choice |\n|xnli_eu |lm_eval/tasks/xnli_eu/xnli_eu.yaml |multiple_choice |\n|xnli_eu_mt |lm_eval/tasks/xnli_eu/xnli_eu_mt.yaml |multiple_choice |\n|xnli_eu_native |lm_eval/tasks/xnli_eu/xnli_eu_native.yaml |multiple_choice |\n|xnli_fr |lm_eval/tasks/xnli/xnli_fr.yaml |multiple_choice |\n|xnli_hi |lm_eval/tasks/xnli/xnli_hi.yaml |multiple_choice |\n|xnli_ru |lm_eval/tasks/xnli/xnli_ru.yaml |multiple_choice |\n|xnli_sw |lm_eval/tasks/xnli/xnli_sw.yaml |multiple_choice |\n|xnli_th |lm_eval/tasks/xnli/xnli_th.yaml |multiple_choice |\n|xnli_tr |lm_eval/tasks/xnli/xnli_tr.yaml |multiple_choice |\n|xnli_ur |lm_eval/tasks/xnli/xnli_ur.yaml |multiple_choice |\n|xnli_vi |lm_eval/tasks/xnli/xnli_vi.yaml |multiple_choice |\n|xnli_zh |lm_eval/tasks/xnli/xnli_zh.yaml |multiple_choice |\n|xstorycloze_ar |lm_eval/tasks/xstorycloze/default_ar.yaml |multiple_choice |\n|xstorycloze_en |lm_eval/tasks/xstorycloze/default_en.yaml |multiple_choice |\n|xstorycloze_es |lm_eval/tasks/xstorycloze/default_es.yaml |multiple_choice |\n|xstorycloze_eu |lm_eval/tasks/xstorycloze/default_eu.yaml |multiple_choice |\n|xstorycloze_hi |lm_eval/tasks/xstorycloze/default_hi.yaml |multiple_choice |\n|xstorycloze_id |lm_eval/tasks/xstorycloze/default_id.yaml |multiple_choice |\n|xstorycloze_my |lm_eval/tasks/xstorycloze/default_my.yaml |multiple_choice |\n|xstorycloze_ru |lm_eval/tasks/xstorycloze/default_ru.yaml |multiple_choice |\n|xstorycloze_sw 
|lm_eval/tasks/xstorycloze/default_sw.yaml |multiple_choice |\n|xstorycloze_te |lm_eval/tasks/xstorycloze/default_te.yaml |multiple_choice |\n|xstorycloze_zh |lm_eval/tasks/xstorycloze/default_zh.yaml |multiple_choice |\n|xwinograd_en |lm_eval/tasks/xwinograd/xwinograd_en.yaml |multiple_choice |\n|xwinograd_fr |lm_eval/tasks/xwinograd/xwinograd_fr.yaml |multiple_choice |\n|xwinograd_jp |lm_eval/tasks/xwinograd/xwinograd_jp.yaml |multiple_choice |\n|xwinograd_pt |lm_eval/tasks/xwinograd/xwinograd_pt.yaml |multiple_choice |\n|xwinograd_ru |lm_eval/tasks/xwinograd/xwinograd_ru.yaml |multiple_choice |\n|xwinograd_zh |lm_eval/tasks/xwinograd/xwinograd_zh.yaml |multiple_choice |\n\n\n\n","output_type":"stream"}]},{"cell_type":"code","source":"!lm_eval --model hf \\\n --model_args pretrained=${MODEL_NAME} \\\n --tasks mmlu \\\n --device cuda:0 \\\n --num_fewshot 0 \\\n --batch_size 4 \\\n --output_path results \\\n --use_cache True\\\n --log_samples \\\n --limit 10 \\\n --hf_hub_log_args hub_results_org=aisuko,hub_repo_name=eval-smolLM-135M,push_results_to_hub=True,push_samples_to_hub=True,public_repo=False","metadata":{"execution":{"iopub.status.busy":"2024-09-02T06:38:25.136350Z","iopub.execute_input":"2024-09-02T06:38:25.136803Z","iopub.status.idle":"2024-09-02T06:40:58.895644Z","shell.execute_reply.started":"2024-09-02T06:38:25.136725Z","shell.execute_reply":"2024-09-02T06:40:58.894432Z"},"trusted":true},"execution_count":6,"outputs":[{"name":"stdout","text":"config.json: 100%|█████████████████████████████| 723/723 [00:00<00:00, 3.78MB/s]\ntokenizer_config.json: 100%|███████████████| 3.59k/3.59k [00:00<00:00, 19.6MB/s]\nvocab.json: 100%|████████████████████████████| 801k/801k [00:00<00:00, 6.47MB/s]\nmerges.txt: 100%|████████████████████████████| 466k/466k [00:00<00:00, 7.21MB/s]\ntokenizer.json: 100%|██████████████████████| 2.10M/2.10M [00:00<00:00, 10.6MB/s]\nspecial_tokens_map.json: 100%|█████████████████| 565/565 [00:00<00:00, 3.13MB/s]\nmodel.safetensors: 
100%|██████████████████████| 269M/269M [00:01<00:00, 244MB/s]\ngeneration_config.json: 100%|███████████████████| 156/156 [00:00<00:00, 788kB/s]\nDownloading builder script: 100%|██████████| 5.86k/5.86k [00:00<00:00, 65.9kB/s]\nDownloading readme: 100%|██████████████████| 1.11k/1.11k [00:00<00:00, 11.2kB/s]\nDownloading data: 100%|███████████████████████| 166M/166M [00:00<00:00, 178MB/s]\nGenerating test split: 204 examples [00:00, 1082.62 examples/s]\nGenerating validation split: 22 examples [00:00, 1940.38 examples/s]\nGenerating dev split: 5 examples [00:00, 29.83 examples/s]\nGenerating test split: 108 examples [00:00, 730.00 examples/s]\nGenerating validation split: 11 examples [00:00, 1170.29 examples/s]\nGenerating dev split: 5 examples [00:00, 32.81 examples/s]\nGenerating test split: 1534 examples [00:00, 5271.17 examples/s]\nGenerating validation split: 170 examples [00:00, 7412.67 examples/s]\nGenerating dev split: 5 examples [00:00, 32.25 examples/s]\nGenerating test split: 163 examples [00:00, 1085.53 examples/s]\nGenerating validation split: 18 examples [00:00, 1727.20 examples/s]\nGenerating dev split: 5 examples [00:00, 30.67 examples/s]\nGenerating test split: 165 examples [00:00, 1081.78 examples/s]\nGenerating validation split: 18 examples [00:00, 1961.99 examples/s]\nGenerating dev split: 5 examples [00:00, 34.10 examples/s]\nGenerating test split: 311 examples [00:00, 1896.89 examples/s]\nGenerating validation split: 34 examples [00:00, 5415.70 examples/s]\nGenerating dev split: 5 examples [00:00, 33.22 examples/s]\nGenerating test split: 121 examples [00:00, 801.22 examples/s]\nGenerating validation split: 13 examples [00:00, 2093.85 examples/s]\nGenerating dev split: 5 examples [00:00, 33.67 examples/s]\nGenerating test split: 895 examples [00:00, 4012.07 examples/s]\nGenerating validation split: 100 examples [00:00, 6663.13 examples/s]\nGenerating dev split: 5 examples [00:00, 31.00 examples/s]\nGenerating test split: 324 examples [00:00, 
1869.19 examples/s]\nGenerating validation split: 35 examples [00:00, 3962.12 examples/s]\nGenerating dev split: 5 examples [00:00, 32.67 examples/s]\nGenerating test split: 126 examples [00:00, 817.53 examples/s]\nGenerating validation split: 14 examples [00:00, 2178.29 examples/s]\nGenerating dev split: 5 examples [00:00, 32.93 examples/s]\nGenerating test split: 171 examples [00:00, 1093.55 examples/s]\nGenerating validation split: 19 examples [00:00, 2799.74 examples/s]\nGenerating dev split: 5 examples [00:00, 32.77 examples/s]\nGenerating test split: 237 examples [00:00, 1227.70 examples/s]\nGenerating validation split: 26 examples [00:00, 2579.03 examples/s]\nGenerating dev split: 5 examples [00:00, 32.46 examples/s]\nGenerating test split: 346 examples [00:00, 1914.51 examples/s]\nGenerating validation split: 38 examples [00:00, 3189.46 examples/s]\nGenerating dev split: 5 examples [00:00, 33.24 examples/s]\nGenerating test split: 114 examples [00:00, 766.72 examples/s]\nGenerating validation split: 12 examples [00:00, 2016.25 examples/s]\nGenerating dev split: 5 examples [00:00, 33.13 examples/s]\nGenerating test split: 390 examples [00:00, 2279.23 examples/s]\nGenerating validation split: 43 examples [00:00, 4930.29 examples/s]\nGenerating dev split: 5 examples [00:00, 32.66 examples/s]\nGenerating test split: 100 examples [00:00, 652.03 examples/s]\nGenerating validation split: 11 examples [00:00, 1984.83 examples/s]\nGenerating dev split: 5 examples [00:00, 32.47 examples/s]\nGenerating test split: 201 examples [00:00, 1306.48 examples/s]\nGenerating validation split: 22 examples [00:00, 2348.67 examples/s]\nGenerating dev split: 5 examples [00:00, 33.30 examples/s]\nGenerating test split: 545 examples [00:00, 3003.41 examples/s]\nGenerating validation split: 60 examples [00:00, 4677.05 examples/s]\nGenerating dev split: 5 examples [00:00, 31.47 examples/s]\nGenerating test split: 612 examples [00:00, 3238.44 examples/s]\nGenerating validation split: 69 
examples [00:00, 5235.10 examples/s]\nGenerating dev split: 5 examples [00:00, 32.40 examples/s]\nGenerating test split: 198 examples [00:00, 1234.16 examples/s]\nGenerating validation split: 22 examples [00:00, 4336.22 examples/s]\nGenerating dev split: 5 examples [00:00, 33.44 examples/s]\nGenerating test split: 238 examples [00:00, 1442.63 examples/s]\nGenerating validation split: 26 examples [00:00, 2925.37 examples/s]\nGenerating dev split: 5 examples [00:00, 32.99 examples/s]\nGenerating test split: 193 examples [00:00, 1273.87 examples/s]\nGenerating validation split: 21 examples [00:00, 3843.95 examples/s]\nGenerating dev split: 5 examples [00:00, 32.84 examples/s]\nGenerating test split: 245 examples [00:00, 1558.07 examples/s]\nGenerating validation split: 27 examples [00:00, 2620.47 examples/s]\nGenerating dev split: 5 examples [00:00, 33.66 examples/s]\nGenerating test split: 131 examples [00:00, 883.09 examples/s]\nGenerating validation split: 12 examples [00:00, 1923.70 examples/s]\nGenerating dev split: 5 examples [00:00, 33.42 examples/s]\nGenerating test split: 110 examples [00:00, 742.36 examples/s]\nGenerating validation split: 12 examples [00:00, 1685.93 examples/s]\nGenerating dev split: 5 examples [00:00, 33.92 examples/s]\nGenerating test split: 100 examples [00:00, 617.84 examples/s]\nGenerating validation split: 10 examples [00:00, 2385.70 examples/s]\nGenerating dev split: 5 examples [00:00, 31.96 examples/s]\nGenerating test split: 234 examples [00:00, 1458.41 examples/s]\nGenerating validation split: 25 examples [00:00, 3112.61 examples/s]\nGenerating dev split: 5 examples [00:00, 33.83 examples/s]\nGenerating test split: 103 examples [00:00, 710.61 examples/s]\nGenerating validation split: 11 examples [00:00, 1732.92 examples/s]\nGenerating dev split: 5 examples [00:00, 33.15 examples/s]\nGenerating test split: 265 examples [00:00, 1613.07 examples/s]\nGenerating validation split: 29 examples [00:00, 4348.14 examples/s]\nGenerating dev 
split: 5 examples [00:00, 34.30 examples/s]\nGenerating test split: 282 examples [00:00, 1676.99 examples/s]\nGenerating validation split: 31 examples [00:00, 3057.86 examples/s]\nGenerating dev split: 5 examples [00:00, 34.24 examples/s]\nGenerating test split: 783 examples [00:00, 3767.20 examples/s]\nGenerating validation split: 86 examples [00:00, 7690.23 examples/s]\nGenerating dev split: 5 examples [00:00, 32.14 examples/s]\nGenerating test split: 223 examples [00:00, 1435.26 examples/s]\nGenerating validation split: 23 examples [00:00, 3141.19 examples/s]\nGenerating dev split: 5 examples [00:00, 33.99 examples/s]\nGenerating test split: 173 examples [00:00, 1133.52 examples/s]\nGenerating validation split: 22 examples [00:00, 3645.20 examples/s]\nGenerating dev split: 5 examples [00:00, 34.38 examples/s]\nGenerating test split: 166 examples [00:00, 1113.05 examples/s]\nGenerating validation split: 18 examples [00:00, 2562.89 examples/s]\nGenerating dev split: 5 examples [00:00, 33.85 examples/s]\nGenerating test split: 100 examples [00:00, 678.24 examples/s]\nGenerating validation split: 11 examples [00:00, 2537.25 examples/s]\nGenerating dev split: 5 examples [00:00, 32.61 examples/s]\nGenerating test split: 306 examples [00:00, 1718.43 examples/s]\nGenerating validation split: 33 examples [00:00, 3861.94 examples/s]\nGenerating dev split: 5 examples [00:00, 34.11 examples/s]\nGenerating test split: 272 examples [00:00, 1642.07 examples/s]\nGenerating validation split: 31 examples [00:00, 3206.18 examples/s]\nGenerating dev split: 5 examples [00:00, 33.39 examples/s]\nGenerating test split: 100 examples [00:00, 673.56 examples/s]\nGenerating validation split: 11 examples [00:00, 2330.05 examples/s]\nGenerating dev split: 5 examples [00:00, 33.96 examples/s]\nGenerating test split: 151 examples [00:00, 987.61 examples/s]\nGenerating validation split: 17 examples [00:00, 2705.49 examples/s]\nGenerating dev split: 5 examples [00:00, 32.86 
examples/s]\nGenerating test split: 378 examples [00:00, 2172.68 examples/s]\nGenerating validation split: 41 examples [00:00, 3947.72 examples/s]\nGenerating dev split: 5 examples [00:00, 33.64 examples/s]\nGenerating test split: 100 examples [00:00, 692.06 examples/s]\nGenerating validation split: 8 examples [00:00, 934.87 examples/s]\nGenerating dev split: 5 examples [00:00, 33.52 examples/s]\nGenerating test split: 144 examples [00:00, 976.62 examples/s]\nGenerating validation split: 16 examples [00:00, 1936.20 examples/s]\nGenerating dev split: 5 examples [00:00, 33.67 examples/s]\nGenerating test split: 100 examples [00:00, 664.91 examples/s]\nGenerating validation split: 11 examples [00:00, 1802.31 examples/s]\nGenerating dev split: 5 examples [00:00, 34.31 examples/s]\nGenerating test split: 112 examples [00:00, 744.62 examples/s]\nGenerating validation split: 11 examples [00:00, 2207.00 examples/s]\nGenerating dev split: 5 examples [00:00, 32.60 examples/s]\nGenerating test split: 270 examples [00:00, 1688.26 examples/s]\nGenerating validation split: 29 examples [00:00, 4663.02 examples/s]\nGenerating dev split: 5 examples [00:00, 33.60 examples/s]\nGenerating test split: 216 examples [00:00, 1333.86 examples/s]\nGenerating validation split: 23 examples [00:00, 3439.18 examples/s]\nGenerating dev split: 5 examples [00:00, 32.13 examples/s]\nGenerating test split: 135 examples [00:00, 904.02 examples/s]\nGenerating validation split: 14 examples [00:00, 2932.79 examples/s]\nGenerating dev split: 5 examples [00:00, 33.55 examples/s]\nGenerating test split: 100 examples [00:00, 684.27 examples/s]\nGenerating validation split: 9 examples [00:00, 2396.44 examples/s]\nGenerating dev split: 5 examples [00:00, 33.51 examples/s]\nGenerating test split: 100 examples [00:00, 655.17 examples/s]\nGenerating validation split: 11 examples [00:00, 1385.80 examples/s]\nGenerating dev split: 5 examples [00:00, 32.85 examples/s]\nGenerating test split: 100 examples [00:00, 
697.73 examples/s]\nGenerating validation split: 11 examples [00:00, 2268.19 examples/s]\nGenerating dev split: 5 examples [00:00, 31.48 examples/s]\nGenerating test split: 152 examples [00:00, 936.74 examples/s]\nGenerating validation split: 16 examples [00:00, 3553.18 examples/s]\nGenerating dev split: 5 examples [00:00, 31.47 examples/s]\nGenerating test split: 102 examples [00:00, 667.42 examples/s]\nGenerating validation split: 11 examples [00:00, 2362.14 examples/s]\nGenerating dev split: 5 examples [00:00, 34.12 examples/s]\nGenerating test split: 100 examples [00:00, 681.36 examples/s]\nGenerating validation split: 11 examples [00:00, 1819.94 examples/s]\nGenerating dev split: 5 examples [00:00, 32.94 examples/s]\nGenerating test split: 145 examples [00:00, 993.89 examples/s]\nGenerating validation split: 16 examples [00:00, 2952.05 examples/s]\nGenerating dev split: 5 examples [00:00, 33.17 examples/s]\nGenerating test split: 235 examples [00:00, 1506.95 examples/s]\nGenerating validation split: 26 examples [00:00, 5103.75 examples/s]\nGenerating dev split: 5 examples [00:00, 32.97 examples/s]\nGenerating test split: 310 examples [00:00, 1896.75 examples/s]\nGenerating validation split: 32 examples [00:00, 4609.76 examples/s]\nGenerating dev split: 5 examples [00:00, 33.32 examples/s]\nGenerating test split: 203 examples [00:00, 1320.57 examples/s]\nGenerating validation split: 22 examples [00:00, 2718.28 examples/s]\nGenerating dev split: 5 examples [00:00, 34.26 examples/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 454.76it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 453.44it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 457.79it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 452.46it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 465.42it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 
462.87it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 459.02it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 451.13it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 453.89it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 447.15it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 452.64it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 462.31it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 451.31it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 463.72it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 456.86it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 465.06it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 465.86it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 455.19it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 462.33it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 457.13it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 466.03it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 454.56it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 468.77it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 456.01it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 463.44it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 461.38it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 464.44it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 453.13it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 460.05it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 
465.86it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 442.89it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 468.22it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 411.25it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 455.70it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 451.29it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 459.23it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 455.58it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 453.61it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 464.07it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 428.60it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 467.43it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 455.23it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 461.55it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 458.16it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 464.54it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 447.43it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 467.55it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 455.97it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 463.19it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 471.83it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 457.00it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 469.97it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 452.18it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 
462.99it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 453.39it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 471.19it/s]\n100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 455.96it/s]\nChecking cached requests: 100%|███████████| 2280/2280 [00:00<00:00, 4853.37it/s]\nRunning loglikelihood requests: 0%| | 0/2280 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)\nRunning loglikelihood requests: 100%|██████| 2280/2280 [00:09<00:00, 228.36it/s]\nTraceback (most recent call last):\n File \"/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_errors.py\", line 304, in hf_raise_for_status\n response.raise_for_status()\n File \"/opt/conda/lib/python3.10/site-packages/requests/models.py\", line 1024, in raise_for_status\n raise HTTPError(http_error_msg, response=self)\nrequests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://huggingface.co/api/datasets/aisuko/eval-smolLM-135M-private/tree/main?recursive=True&expand=False\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"/opt/conda/bin/lm_eval\", line 8, in <module>\n sys.exit(cli_evaluate())\n File \"/kaggle/working/lm-evaluation-harness/lm_eval/__main__.py\", line 452, in cli_evaluate\n evaluation_tracker.recreate_metadata_card()\n File \"/kaggle/working/lm-evaluation-harness/lm_eval/loggers/evaluation_tracker.py\", line 374, in recreate_metadata_card\n files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type=\"dataset\")\n File \"/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py\", line 114, in _inner_fn\n return fn(*args, **kwargs)\n File 
\"/opt/conda/lib/python3.10/site-packages/huggingface_hub/hf_api.py\", line 2775, in list_repo_files\n return [\n File \"/opt/conda/lib/python3.10/site-packages/huggingface_hub/hf_api.py\", line 2775, in <listcomp>\n return [\n File \"/opt/conda/lib/python3.10/site-packages/huggingface_hub/hf_api.py\", line 2912, in list_repo_tree\n for path_info in paginate(path=tree_url, headers=headers, params={\"recursive\": recursive, \"expand\": expand}):\n File \"/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_pagination.py\", line 37, in paginate\n hf_raise_for_status(r)\n File \"/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_errors.py\", line 352, in hf_raise_for_status\n raise RepositoryNotFoundError(message, response) from e\nhuggingface_hub.utils._errors.RepositoryNotFoundError: 404 Client Error. (Request ID: Root=1-66d55df9-1d203add30f89f104017de30;6761585a-4fd1-4f86-afaa-0121585bcea0)\n\nRepository Not Found for url: https://huggingface.co/api/datasets/aisuko/eval-smolLM-135M-private/tree/main?recursive=True&expand=False.\nPlease make sure you specified the correct `repo_id` and `repo_type`.\nIf you are trying to access a private or gated repo, make sure you are authenticated.\n","output_type":"stream"}]},{"cell_type":"markdown","source":"# Related issues\n\n* https://github.com/EleutherAI/lm-evaluation-harness/issues/2263","metadata":{}}]}