diff --git a/example.ipynb b/example.ipynb index 0b86a72..98695db 100644 --- a/example.ipynb +++ b/example.ipynb @@ -55,59 +55,596 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Step 1: Loading dataset\n", - "Loaded 1 queries from data/annotation/my_sample_dataset/dataset.jsonl\n", - "Step 2: Creating pairs for AI scoring\n" + "Obtaining file:///C:/Users/polasani%20rohit/OneDrive/Desktop/zbench\n", + " Installing build dependencies: started\n", + " Installing build dependencies: finished with status 'done'\n", + " Checking if build backend supports build_editable: started\n", + " Checking if build backend supports build_editable: finished with status 'done'\n", + " Getting requirements to build editable: started\n", + " Getting requirements to build editable: finished with status 'done'\n", + " Preparing editable metadata (pyproject.toml): started\n", + " Preparing editable metadata (pyproject.toml): finished with status 'done'\n", + "Collecting openlimit@ git+https://github.com/shobrook/openlimit.git@master (from zbench==0.1.0)\n", + " Cloning https://github.com/shobrook/openlimit.git (to revision master) to c:\\users\\polasani rohit\\appdata\\local\\temp\\pip-install-x2voovyh\\openlimit_f3705ea5465049dbbc8500a67df63878\n", + " Resolved https://github.com/shobrook/openlimit.git to commit dbacec38467ab17d99607871ef6742d301c34470\n", + " Installing build dependencies: started\n", + " Installing build dependencies: finished with status 'done'\n", + " Getting requirements to build wheel: started\n", + " Getting requirements to build wheel: finished with status 'done'\n", + " Preparing metadata (pyproject.toml): started\n", + " Preparing metadata (pyproject.toml): finished with status 'done'\n", + "Collecting pydantic==2.11.7 (from zbench==0.1.0)\n", + " Obtaining dependency information for pydantic==2.11.7 from https://files.pythonhosted.org/packages/6a/c0/ec2b1c8712ca690e5d61979dee872603e92b8a32f94cc1b72d53beab008a/pydantic-2.11.7-py3-none-any.whl.metadata\n", + " Downloading pydantic-2.11.7-py3-none-any.whl.metadata (67 kB)\n", + " ---------------------------------------- 0.0/68.0 kB ? eta -:--:--\n", + " ---------------------------------------- 0.0/68.0 kB ? eta -:--:--\n", + " ------ --------------------------------- 10.2/68.0 kB ? eta -:--:--\n", + " ---------------------------------- --- 61.4/68.0 kB 825.8 kB/s eta 0:00:01\n", + " -------------------------------------- 68.0/68.0 kB 740.1 kB/s eta 0:00:00\n", + "Collecting numpy==2.3.1 (from zbench==0.1.0)\n", + " Obtaining dependency information for numpy==2.3.1 from https://files.pythonhosted.org/packages/b1/3e/e28f4c1dd9e042eb57a3eb652f200225e311b608632bc727ae378623d4f8/numpy-2.3.1-cp312-cp312-win_amd64.whl.metadata\n", + " Downloading numpy-2.3.1-cp312-cp312-win_amd64.whl.metadata (60 kB)\n", + " ---------------------------------------- 0.0/60.9 kB ? 
eta -:--:--\n", + " ---------------------------------------- 60.9/60.9 kB 3.2 MB/s eta 0:00:00\n", + "Collecting tqdm==4.67.1 (from zbench==0.1.0)\n", + " Obtaining dependency information for tqdm==4.67.1 from https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl.metadata\n", + " Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)\n", + "Collecting aiohttp==3.12.14 (from zbench==0.1.0)\n", + " Obtaining dependency information for aiohttp==3.12.14 from https://files.pythonhosted.org/packages/98/d5/7ac2464aebd2eecac38dbe96148c9eb487679c512449ba5215d233755582/aiohttp-3.12.14-cp312-cp312-win_amd64.whl.metadata\n", + " Downloading aiohttp-3.12.14-cp312-cp312-win_amd64.whl.metadata (7.9 kB)\n", + "Collecting python-dotenv==1.1.1 (from zbench==0.1.0)\n", + " Obtaining dependency information for python-dotenv==1.1.1 from https://files.pythonhosted.org/packages/5f/ed/539768cf28c661b5b068d66d96a2f155c4971a5d55684a514c1a0e0dec2f/python_dotenv-1.1.1-py3-none-any.whl.metadata\n", + " Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)\n", + "Collecting datasets==4.0.0 (from zbench==0.1.0)\n", + " Obtaining dependency information for datasets==4.0.0 from https://files.pythonhosted.org/packages/eb/62/eb8157afb21bd229c864521c1ab4fa8e9b4f1b06bafdd8c4668a7a31b5dd/datasets-4.0.0-py3-none-any.whl.metadata\n", + " Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)\n", + "Collecting anthropic==0.57.1 (from zbench==0.1.0)\n", + " Obtaining dependency information for anthropic==0.57.1 from https://files.pythonhosted.org/packages/e5/cf/ca0ba77805aec6171629a8b665c7dc224dab374539c3d27005b5d8c100a0/anthropic-0.57.1-py3-none-any.whl.metadata\n", + " Downloading anthropic-0.57.1-py3-none-any.whl.metadata (27 kB)\n", + "Collecting openai==1.97.0 (from zbench==0.1.0)\n", + " Obtaining dependency information for openai==1.97.0 from https://files.pythonhosted.org/packages/8a/91/1f1cf577f745e956b276a8b1d3d76fa7a6ee0c2b05db3b001b900f2c71db/openai-1.97.0-py3-none-any.whl.metadata\n", + " Downloading openai-1.97.0-py3-none-any.whl.metadata (29 kB)\n", + "Collecting redis==6.2.0 (from zbench==0.1.0)\n", + " Obtaining dependency information for redis==6.2.0 from https://files.pythonhosted.org/packages/13/67/e60968d3b0e077495a8fee89cf3f2373db98e528288a48f1ee44967f6e8c/redis-6.2.0-py3-none-any.whl.metadata\n", + " Downloading redis-6.2.0-py3-none-any.whl.metadata (10 kB)\n", + "Collecting tiktoken==0.9.0 (from zbench==0.1.0)\n", + " Obtaining dependency information for tiktoken==0.9.0 from https://files.pythonhosted.org/packages/5b/64/b16003419a1d7728d0d8c0d56a4c24325e7b10a21a9dd1fc0f7115c02f0a/tiktoken-0.9.0-cp312-cp312-win_amd64.whl.metadata\n", + " Downloading tiktoken-0.9.0-cp312-cp312-win_amd64.whl.metadata (6.8 kB)\n", + "Collecting loguru==0.7.3 (from zbench==0.1.0)\n", + " Obtaining dependency information for loguru==0.7.3 from https://files.pythonhosted.org/packages/0c/29/0348de65b8cc732daa3e33e67806420b2ae89bdce2b04af740289c5c6c8c/loguru-0.7.3-py3-none-any.whl.metadata\n", + " Using cached loguru-0.7.3-py3-none-any.whl.metadata (22 kB)\n", + "Collecting zeroentropy==0.1.0a6 (from zbench==0.1.0)\n", + " Obtaining dependency information for zeroentropy==0.1.0a6 from https://files.pythonhosted.org/packages/3b/7e/594e9ec5cda6d8f4dc249c5ca0c6b31b19886eafedcea6c41a2400a2a7b9/zeroentropy-0.1.0a6-py3-none-any.whl.metadata\n", + " Downloading zeroentropy-0.1.0a6-py3-none-any.whl.metadata (16 kB)\n", + "Collecting 
matplotlib==3.10.3 (from zbench==0.1.0)\n", + " Obtaining dependency information for matplotlib==3.10.3 from https://files.pythonhosted.org/packages/b1/79/0d1c165eac44405a86478082e225fce87874f7198300bbebc55faaf6d28d/matplotlib-3.10.3-cp312-cp312-win_amd64.whl.metadata\n", + " Downloading matplotlib-3.10.3-cp312-cp312-win_amd64.whl.metadata (11 kB)\n", + "Collecting ipykernel==6.30.0 (from zbench==0.1.0)\n", + " Obtaining dependency information for ipykernel==6.30.0 from https://files.pythonhosted.org/packages/1f/3d/00813c3d9b46e3dcd88bd4530e0a3c63c0509e5d8c9eff34723ea243ab04/ipykernel-6.30.0-py3-none-any.whl.metadata\n", + " Downloading ipykernel-6.30.0-py3-none-any.whl.metadata (6.2 kB)\n", + "Collecting ipywidgets==8.1.7 (from zbench==0.1.0)\n", + " Obtaining dependency information for ipywidgets==8.1.7 from https://files.pythonhosted.org/packages/58/6a/9166369a2f092bd286d24e6307de555d63616e8ddb373ebad2b5635ca4cd/ipywidgets-8.1.7-py3-none-any.whl.metadata\n", + " Downloading ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)\n", + "Collecting aiohappyeyeballs>=2.5.0 (from aiohttp==3.12.14->zbench==0.1.0)\n", + " Obtaining dependency information for aiohappyeyeballs>=2.5.0 from https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata\n", + " Downloading aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)\n", + "Collecting aiosignal>=1.4.0 (from aiohttp==3.12.14->zbench==0.1.0)\n", + " Obtaining dependency information for aiosignal>=1.4.0 from https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl.metadata\n", + " Downloading aiosignal-1.4.0-py3-none-any.whl.metadata (3.7 kB)\n", + "Collecting attrs>=17.3.0 (from aiohttp==3.12.14->zbench==0.1.0)\n", + " Obtaining dependency information for attrs>=17.3.0 from https://files.pythonhosted.org/packages/77/06/bb80f5f86020c4551da315d78b3ab75e8228f89f0162f2c3a819e407941a/attrs-25.3.0-py3-none-any.whl.metadata\n", + " Downloading attrs-25.3.0-py3-none-any.whl.metadata (10 kB)\n", + "Collecting frozenlist>=1.1.1 (from aiohttp==3.12.14->zbench==0.1.0)\n", + " Obtaining dependency information for frozenlist>=1.1.1 from https://files.pythonhosted.org/packages/0b/15/c026e9a9fc17585a9d461f65d8593d281fedf55fbf7eb53f16c6df2392f9/frozenlist-1.7.0-cp312-cp312-win_amd64.whl.metadata\n", + " Downloading frozenlist-1.7.0-cp312-cp312-win_amd64.whl.metadata (19 kB)\n", + "Collecting multidict<7.0,>=4.5 (from aiohttp==3.12.14->zbench==0.1.0)\n", + " Obtaining dependency information for multidict<7.0,>=4.5 from https://files.pythonhosted.org/packages/c7/eb/d88b1780d43a56db2cba24289fa744a9d216c1a8546a0dc3956563fd53ea/multidict-6.6.4-cp312-cp312-win_amd64.whl.metadata\n", + " Downloading multidict-6.6.4-cp312-cp312-win_amd64.whl.metadata (5.4 kB)\n", + "Collecting propcache>=0.2.0 (from aiohttp==3.12.14->zbench==0.1.0)\n", + " Obtaining dependency information for propcache>=0.2.0 from https://files.pythonhosted.org/packages/19/61/d582be5d226cf79071681d1b46b848d6cb03d7b70af7063e33a2787eaa03/propcache-0.3.2-cp312-cp312-win_amd64.whl.metadata\n", + " Downloading propcache-0.3.2-cp312-cp312-win_amd64.whl.metadata (12 kB)\n", + "Collecting yarl<2.0,>=1.17.0 (from aiohttp==3.12.14->zbench==0.1.0)\n", + " Obtaining dependency information for yarl<2.0,>=1.17.0 from 
https://files.pythonhosted.org/packages/eb/83/5d9092950565481b413b31a23e75dd3418ff0a277d6e0abf3729d4d1ce25/yarl-1.20.1-cp312-cp312-win_amd64.whl.metadata\n", + " Downloading yarl-1.20.1-cp312-cp312-win_amd64.whl.metadata (76 kB)\n", + " ---------------------------------------- 0.0/76.3 kB ? eta -:--:--\n", + " ---------------------------------------- 76.3/76.3 kB 4.4 MB/s eta 0:00:00\n", + "Collecting anyio<5,>=3.5.0 (from anthropic==0.57.1->zbench==0.1.0)\n", + " Obtaining dependency information for anyio<5,>=3.5.0 from https://files.pythonhosted.org/packages/6f/12/e5e0282d673bb9746bacfb6e2dba8719989d3660cdb2ea79aee9a9651afb/anyio-4.10.0-py3-none-any.whl.metadata\n", + " Downloading anyio-4.10.0-py3-none-any.whl.metadata (4.0 kB)\n", + "Collecting distro<2,>=1.7.0 (from anthropic==0.57.1->zbench==0.1.0)\n", + " Obtaining dependency information for distro<2,>=1.7.0 from https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl.metadata\n", + " Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)\n", + "Collecting httpx<1,>=0.25.0 (from anthropic==0.57.1->zbench==0.1.0)\n", + " Obtaining dependency information for httpx<1,>=0.25.0 from https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl.metadata\n", + " Downloading httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)\n", + "Collecting jiter<1,>=0.4.0 (from anthropic==0.57.1->zbench==0.1.0)\n", + " Obtaining dependency information for jiter<1,>=0.4.0 from https://files.pythonhosted.org/packages/9b/52/7ec47455e26f2d6e5f2ea4951a0652c06e5b995c291f723973ae9e724a65/jiter-0.10.0-cp312-cp312-win_amd64.whl.metadata\n", + " Downloading jiter-0.10.0-cp312-cp312-win_amd64.whl.metadata (5.3 kB)\n", + "Collecting sniffio (from anthropic==0.57.1->zbench==0.1.0)\n", + " Obtaining dependency information for sniffio from https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl.metadata\n", + " Using cached sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)\n", + "Collecting typing-extensions<5,>=4.10 (from anthropic==0.57.1->zbench==0.1.0)\n", + " Obtaining dependency information for typing-extensions<5,>=4.10 from https://files.pythonhosted.org/packages/b5/00/d631e67a838026495268c2f6884f3711a15a9a2a96cd244fdaea53b823fb/typing_extensions-4.14.1-py3-none-any.whl.metadata\n", + " Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)\n", + "Collecting filelock (from datasets==4.0.0->zbench==0.1.0)\n", + " Obtaining dependency information for filelock from https://files.pythonhosted.org/packages/4d/36/2a115987e2d8c300a974597416d9de88f2444426de9571f4b59b2cca3acc/filelock-3.18.0-py3-none-any.whl.metadata\n", + " Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)\n", + "Collecting pyarrow>=15.0.0 (from datasets==4.0.0->zbench==0.1.0)\n", + " Obtaining dependency information for pyarrow>=15.0.0 from https://files.pythonhosted.org/packages/71/30/f3795b6e192c3ab881325ffe172e526499eb3780e306a15103a2764916a2/pyarrow-21.0.0-cp312-cp312-win_amd64.whl.metadata\n", + " Downloading pyarrow-21.0.0-cp312-cp312-win_amd64.whl.metadata (3.4 kB)\n", + "Collecting dill<0.3.9,>=0.3.0 (from datasets==4.0.0->zbench==0.1.0)\n", + " Obtaining dependency information for dill<0.3.9,>=0.3.0 from 
https://files.pythonhosted.org/packages/c9/7a/cef76fd8438a42f96db64ddaa85280485a9c395e7df3db8158cfec1eee34/dill-0.3.8-py3-none-any.whl.metadata\n", + " Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n", + "Collecting pandas (from datasets==4.0.0->zbench==0.1.0)\n", + " Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/80/a5/3a92893e7399a691bad7664d977cb5e7c81cf666c81f89ea76ba2bff483d/pandas-2.3.1-cp312-cp312-win_amd64.whl.metadata\n", + " Downloading pandas-2.3.1-cp312-cp312-win_amd64.whl.metadata (19 kB)\n", + "Collecting requests>=2.32.2 (from datasets==4.0.0->zbench==0.1.0)\n", + " Obtaining dependency information for requests>=2.32.2 from https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl.metadata\n", + " Downloading requests-2.32.4-py3-none-any.whl.metadata (4.9 kB)\n", + "Collecting xxhash (from datasets==4.0.0->zbench==0.1.0)\n", + " Obtaining dependency information for xxhash from https://files.pythonhosted.org/packages/d9/6b/1c443fe6cfeb4ad1dcf231cdec96eb94fb43d6498b4469ed8b51f8b59a37/xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata\n", + " Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)\n", + "Collecting multiprocess<0.70.17 (from datasets==4.0.0->zbench==0.1.0)\n", + " Obtaining dependency information for multiprocess<0.70.17 from https://files.pythonhosted.org/packages/0a/7d/a988f258104dcd2ccf1ed40fdc97e26c4ac351eeaf81d76e266c52d84e2f/multiprocess-0.70.16-py312-none-any.whl.metadata\n", + " Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)\n", + "Collecting fsspec[http]<=2025.3.0,>=2023.1.0 (from datasets==4.0.0->zbench==0.1.0)\n", + " Obtaining dependency information for fsspec[http]<=2025.3.0,>=2023.1.0 from https://files.pythonhosted.org/packages/56/53/eb690efa8513166adef3e0669afd31e95ffde69fb3c52ec2ac7223ed6018/fsspec-2025.3.0-py3-none-any.whl.metadata\n", + " Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)\n", + "Collecting huggingface-hub>=0.24.0 (from datasets==4.0.0->zbench==0.1.0)\n", + " Obtaining dependency information for huggingface-hub>=0.24.0 from https://files.pythonhosted.org/packages/39/7b/bb06b061991107cd8783f300adff3e7b7f284e330fd82f507f2a1417b11d/huggingface_hub-0.34.4-py3-none-any.whl.metadata\n", + " Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)\n", + "Requirement already satisfied: packaging in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from datasets==4.0.0->zbench==0.1.0) (25.0)\n", + "Collecting pyyaml>=5.1 (from datasets==4.0.0->zbench==0.1.0)\n", + " Obtaining dependency information for pyyaml>=5.1 from https://files.pythonhosted.org/packages/0c/e8/4f648c598b17c3d06e8753d7d13d57542b30d56e6c2dedf9c331ae56312e/PyYAML-6.0.2-cp312-cp312-win_amd64.whl.metadata\n", + " Downloading PyYAML-6.0.2-cp312-cp312-win_amd64.whl.metadata (2.1 kB)\n", + "Requirement already satisfied: comm>=0.1.1 in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from ipykernel==6.30.0->zbench==0.1.0) (0.2.3)\n", + "Requirement already satisfied: debugpy>=1.6.5 in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from ipykernel==6.30.0->zbench==0.1.0) (1.8.16)\n", + "Requirement already satisfied: ipython>=7.23.1 in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from ipykernel==6.30.0->zbench==0.1.0) (9.4.0)\n", + "Requirement already satisfied: 
jupyter-client>=8.0.0 in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from ipykernel==6.30.0->zbench==0.1.0) (8.6.3)\n", + "Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from ipykernel==6.30.0->zbench==0.1.0) (5.8.1)\n", + "Requirement already satisfied: matplotlib-inline>=0.1 in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from ipykernel==6.30.0->zbench==0.1.0) (0.1.7)\n", + "Requirement already satisfied: nest-asyncio>=1.4 in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from ipykernel==6.30.0->zbench==0.1.0) (1.6.0)\n", + "Requirement already satisfied: psutil>=5.7 in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from ipykernel==6.30.0->zbench==0.1.0) (7.0.0)\n", + "Requirement already satisfied: pyzmq>=25 in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from ipykernel==6.30.0->zbench==0.1.0) (27.0.1)\n", + "Requirement already satisfied: tornado>=6.2 in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from ipykernel==6.30.0->zbench==0.1.0) (6.5.2)\n", + "Requirement already satisfied: traitlets>=5.4.0 in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from ipykernel==6.30.0->zbench==0.1.0) (5.14.3)\n", + "Collecting widgetsnbextension~=4.0.14 (from ipywidgets==8.1.7->zbench==0.1.0)\n", + " Obtaining dependency information for widgetsnbextension~=4.0.14 from https://files.pythonhosted.org/packages/ca/51/5447876806d1088a0f8f71e16542bf350918128d0a69437df26047c8e46f/widgetsnbextension-4.0.14-py3-none-any.whl.metadata\n", + " Downloading widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)\n", + "Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets==8.1.7->zbench==0.1.0)\n", + " Obtaining dependency information for jupyterlab_widgets~=3.0.15 from https://files.pythonhosted.org/packages/43/6a/ca128561b22b60bd5a0c4ea26649e68c8556b82bc70a0c396eebc977fe86/jupyterlab_widgets-3.0.15-py3-none-any.whl.metadata\n", + " Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl.metadata (20 kB)\n", + "Requirement already satisfied: colorama>=0.3.4 in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from loguru==0.7.3->zbench==0.1.0) (0.4.6)\n", + "Collecting win32-setctime>=1.0.0 (from loguru==0.7.3->zbench==0.1.0)\n", + " Obtaining dependency information for win32-setctime>=1.0.0 from https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl.metadata\n", + " Using cached win32_setctime-1.2.0-py3-none-any.whl.metadata (2.4 kB)\n", + "Collecting contourpy>=1.0.1 (from matplotlib==3.10.3->zbench==0.1.0)\n", + " Obtaining dependency information for contourpy>=1.0.1 from https://files.pythonhosted.org/packages/19/e8/6026ed58a64563186a9ee3f29f41261fd1828f527dd93d33b60feca63352/contourpy-1.3.3-cp312-cp312-win_amd64.whl.metadata\n", + " Downloading contourpy-1.3.3-cp312-cp312-win_amd64.whl.metadata (5.5 kB)\n", + "Collecting cycler>=0.10 (from matplotlib==3.10.3->zbench==0.1.0)\n", + " Obtaining dependency information for cycler>=0.10 from https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl.metadata\n", + " Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)\n", + "Collecting 
fonttools>=4.22.0 (from matplotlib==3.10.3->zbench==0.1.0)\n", + " Obtaining dependency information for fonttools>=4.22.0 from https://files.pythonhosted.org/packages/09/45/d2bdc9ea20bbadec1016fd0db45696d573d7a26d95ab5174ffcb6d74340b/fonttools-4.59.0-cp312-cp312-win_amd64.whl.metadata\n", + " Downloading fonttools-4.59.0-cp312-cp312-win_amd64.whl.metadata (110 kB)\n", + " ---------------------------------------- 0.0/110.1 kB ? eta -:--:--\n", + " -------------------------------------- 110.1/110.1 kB 6.2 MB/s eta 0:00:00\n", + "Collecting kiwisolver>=1.3.1 (from matplotlib==3.10.3->zbench==0.1.0)\n", + " Obtaining dependency information for kiwisolver>=1.3.1 from https://files.pythonhosted.org/packages/a0/41/85d82b0291db7504da3c2defe35c9a8a5c9803a730f297bd823d11d5fb77/kiwisolver-1.4.9-cp312-cp312-win_amd64.whl.metadata\n", + " Downloading kiwisolver-1.4.9-cp312-cp312-win_amd64.whl.metadata (6.4 kB)\n", + "Collecting pillow>=8 (from matplotlib==3.10.3->zbench==0.1.0)\n", + " Obtaining dependency information for pillow>=8 from https://files.pythonhosted.org/packages/8c/ce/e7dfc873bdd9828f3b6e5c2bbb74e47a98ec23cc5c74fc4e54462f0d9204/pillow-11.3.0-cp312-cp312-win_amd64.whl.metadata\n", + " Downloading pillow-11.3.0-cp312-cp312-win_amd64.whl.metadata (9.2 kB)\n", + "Collecting pyparsing>=2.3.1 (from matplotlib==3.10.3->zbench==0.1.0)\n", + " Obtaining dependency information for pyparsing>=2.3.1 from https://files.pythonhosted.org/packages/05/e7/df2285f3d08fee213f2d041540fa4fc9ca6c2d44cf36d3a035bf2a8d2bcc/pyparsing-3.2.3-py3-none-any.whl.metadata\n", + " Downloading pyparsing-3.2.3-py3-none-any.whl.metadata (5.0 kB)\n", + "Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from matplotlib==3.10.3->zbench==0.1.0) (2.9.0.post0)\n", + "Collecting annotated-types>=0.6.0 (from pydantic==2.11.7->zbench==0.1.0)\n", + " Obtaining dependency information for annotated-types>=0.6.0 from https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl.metadata\n", + " Using cached annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)\n", + "Collecting pydantic-core==2.33.2 (from pydantic==2.11.7->zbench==0.1.0)\n", + " Obtaining dependency information for pydantic-core==2.33.2 from https://files.pythonhosted.org/packages/43/e4/5479fecb3606c1368d496a825d8411e126133c41224c1e7238be58b87d7e/pydantic_core-2.33.2-cp312-cp312-win_amd64.whl.metadata\n", + " Downloading pydantic_core-2.33.2-cp312-cp312-win_amd64.whl.metadata (6.9 kB)\n", + "Collecting typing-inspection>=0.4.0 (from pydantic==2.11.7->zbench==0.1.0)\n", + " Obtaining dependency information for typing-inspection>=0.4.0 from https://files.pythonhosted.org/packages/17/69/cd203477f944c353c31bade965f880aa1061fd6bf05ded0726ca845b6ff7/typing_inspection-0.4.1-py3-none-any.whl.metadata\n", + " Downloading typing_inspection-0.4.1-py3-none-any.whl.metadata (2.6 kB)\n", + "Collecting regex>=2022.1.18 (from tiktoken==0.9.0->zbench==0.1.0)\n", + " Obtaining dependency information for regex>=2022.1.18 from https://files.pythonhosted.org/packages/3b/39/bd922b55a4fc5ad5c13753274e5b536f5b06ec8eb9747675668491c7ab7a/regex-2025.7.34-cp312-cp312-win_amd64.whl.metadata\n", + " Downloading regex-2025.7.34-cp312-cp312-win_amd64.whl.metadata (41 kB)\n", + " ---------------------------------------- 0.0/41.5 kB ? 
eta -:--:--\n", + " ---------------------------------------- 41.5/41.5 kB 2.0 MB/s eta 0:00:00\n", + "Collecting idna>=2.8 (from anyio<5,>=3.5.0->anthropic==0.57.1->zbench==0.1.0)\n", + " Obtaining dependency information for idna>=2.8 from https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl.metadata\n", + " Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)\n", + "Collecting certifi (from httpx<1,>=0.25.0->anthropic==0.57.1->zbench==0.1.0)\n", + " Obtaining dependency information for certifi from https://files.pythonhosted.org/packages/e5/48/1549795ba7742c948d2ad169c1c8cdbae65bc450d6cd753d124b17c8cd32/certifi-2025.8.3-py3-none-any.whl.metadata\n", + " Downloading certifi-2025.8.3-py3-none-any.whl.metadata (2.4 kB)\n", + "Collecting httpcore==1.* (from httpx<1,>=0.25.0->anthropic==0.57.1->zbench==0.1.0)\n", + " Obtaining dependency information for httpcore==1.* from https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl.metadata\n", + " Downloading httpcore-1.0.9-py3-none-any.whl.metadata (21 kB)\n", + "Collecting h11>=0.16 (from httpcore==1.*->httpx<1,>=0.25.0->anthropic==0.57.1->zbench==0.1.0)\n", + " Obtaining dependency information for h11>=0.16 from https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl.metadata\n", + " Downloading h11-0.16.0-py3-none-any.whl.metadata (8.3 kB)\n", + "Requirement already satisfied: decorator in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from ipython>=7.23.1->ipykernel==6.30.0->zbench==0.1.0) (5.2.1)\n", + "Requirement already satisfied: ipython-pygments-lexers in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from ipython>=7.23.1->ipykernel==6.30.0->zbench==0.1.0) (1.1.1)\n", + "Requirement already satisfied: jedi>=0.16 in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from ipython>=7.23.1->ipykernel==6.30.0->zbench==0.1.0) (0.19.2)\n", + "Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from ipython>=7.23.1->ipykernel==6.30.0->zbench==0.1.0) (3.0.51)\n", + "Requirement already satisfied: pygments>=2.4.0 in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from ipython>=7.23.1->ipykernel==6.30.0->zbench==0.1.0) (2.19.2)\n", + "Requirement already satisfied: stack_data in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from ipython>=7.23.1->ipykernel==6.30.0->zbench==0.1.0) (0.6.3)\n", + "Requirement already satisfied: platformdirs>=2.5 in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from jupyter-core!=5.0.*,>=4.12->ipykernel==6.30.0->zbench==0.1.0) (4.3.8)\n", + "Requirement already satisfied: pywin32>=300 in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from jupyter-core!=5.0.*,>=4.12->ipykernel==6.30.0->zbench==0.1.0) (311)\n", + "Requirement already satisfied: six>=1.5 in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from python-dateutil>=2.7->matplotlib==3.10.3->zbench==0.1.0) (1.17.0)\n", + "Collecting charset_normalizer<4,>=2 (from requests>=2.32.2->datasets==4.0.0->zbench==0.1.0)\n", + " Obtaining dependency information for charset_normalizer<4,>=2 
from https://files.pythonhosted.org/packages/39/f5/3b3836ca6064d0992c58c7561c6b6eee1b3892e9665d650c803bd5614522/charset_normalizer-3.4.3-cp312-cp312-win_amd64.whl.metadata\n", + " Downloading charset_normalizer-3.4.3-cp312-cp312-win_amd64.whl.metadata (37 kB)\n", + "Collecting urllib3<3,>=1.21.1 (from requests>=2.32.2->datasets==4.0.0->zbench==0.1.0)\n", + " Obtaining dependency information for urllib3<3,>=1.21.1 from https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl.metadata\n", + " Downloading urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)\n", + "Collecting pytz>=2020.1 (from pandas->datasets==4.0.0->zbench==0.1.0)\n", + " Obtaining dependency information for pytz>=2020.1 from https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl.metadata\n", + " Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)\n", + "Collecting tzdata>=2022.7 (from pandas->datasets==4.0.0->zbench==0.1.0)\n", + " Obtaining dependency information for tzdata>=2022.7 from https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl.metadata\n", + " Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)\n", + "Requirement already satisfied: parso<0.9.0,>=0.8.4 in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from jedi>=0.16->ipython>=7.23.1->ipykernel==6.30.0->zbench==0.1.0) (0.8.4)\n", + "Requirement already satisfied: wcwidth in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=7.23.1->ipykernel==6.30.0->zbench==0.1.0) (0.2.13)\n", + "Requirement already satisfied: executing>=1.2.0 in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from stack_data->ipython>=7.23.1->ipykernel==6.30.0->zbench==0.1.0) (2.2.0)\n", + "Requirement already satisfied: asttokens>=2.1.0 in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from stack_data->ipython>=7.23.1->ipykernel==6.30.0->zbench==0.1.0) (3.0.0)\n", + "Requirement already satisfied: pure-eval in c:\\users\\polasani rohit\\appdata\\roaming\\python\\python312\\site-packages (from stack_data->ipython>=7.23.1->ipykernel==6.30.0->zbench==0.1.0) (0.2.3)\n", + "Downloading aiohttp-3.12.14-cp312-cp312-win_amd64.whl (449 kB)\n", + " ---------------------------------------- 0.0/449.3 kB ? eta -:--:--\n", + " --------------------------------------- 440.3/449.3 kB 9.2 MB/s eta 0:00:01\n", + " ---------------------------------------- 449.3/449.3 kB 9.3 MB/s eta 0:00:00\n", + "Downloading anthropic-0.57.1-py3-none-any.whl (292 kB)\n", + " ---------------------------------------- 0.0/292.8 kB ? eta -:--:--\n", + " ---------------------------------------- 292.8/292.8 kB 9.1 MB/s eta 0:00:00\n", + "Downloading datasets-4.0.0-py3-none-any.whl (494 kB)\n", + " ---------------------------------------- 0.0/494.8 kB ? eta -:--:--\n", + " -------------------------------------- 491.5/494.8 kB 30.1 MB/s eta 0:00:01\n", + " --------------------------------------- 494.8/494.8 kB 15.6 MB/s eta 0:00:00\n", + "Downloading ipykernel-6.30.0-py3-none-any.whl (117 kB)\n", + " ---------------------------------------- 0.0/117.3 kB ? 
eta -:--:--\n", + " ---------------------------------------- 117.3/117.3 kB 6.7 MB/s eta 0:00:00\n", + "Downloading ipywidgets-8.1.7-py3-none-any.whl (139 kB)\n", + " ---------------------------------------- 0.0/139.8 kB ? eta -:--:--\n", + " ---------------------------------------- 139.8/139.8 kB 8.1 MB/s eta 0:00:00\n", + "Using cached loguru-0.7.3-py3-none-any.whl (61 kB)\n", + "Downloading matplotlib-3.10.3-cp312-cp312-win_amd64.whl (8.1 MB)\n", + " ---------------------------------------- 0.0/8.1 MB ? eta -:--:--\n", + " -- ------------------------------------- 0.6/8.1 MB 18.5 MB/s eta 0:00:01\n", + " ---- ----------------------------------- 0.9/8.1 MB 11.7 MB/s eta 0:00:01\n", + " ------- -------------------------------- 1.5/8.1 MB 13.3 MB/s eta 0:00:01\n", + " ---------- ----------------------------- 2.2/8.1 MB 12.4 MB/s eta 0:00:01\n", + " ------------ --------------------------- 2.6/8.1 MB 11.7 MB/s eta 0:00:01\n", + " --------------- ------------------------ 3.2/8.1 MB 12.1 MB/s eta 0:00:01\n", + " ------------------- -------------------- 3.9/8.1 MB 12.3 MB/s eta 0:00:01\n", + " -------------------- ------------------- 4.2/8.1 MB 12.7 MB/s eta 0:00:01\n", + " -------------------- ------------------- 4.2/8.1 MB 12.7 MB/s eta 0:00:01\n", + " -------------------- ------------------- 4.2/8.1 MB 12.7 MB/s eta 0:00:01\n", + " -------------------- ------------------- 4.2/8.1 MB 12.7 MB/s eta 0:00:01\n", + " -------------------- ------------------- 4.2/8.1 MB 12.7 MB/s eta 0:00:01\n", + " ------------------------ --------------- 4.9/8.1 MB 8.2 MB/s eta 0:00:01\n", + " ------------------------- -------------- 5.2/8.1 MB 8.4 MB/s eta 0:00:01\n", + " ------------------------- -------------- 5.2/8.1 MB 8.4 MB/s eta 0:00:01\n", + " ------------------------- -------------- 5.2/8.1 MB 8.4 MB/s eta 0:00:01\n", + " ------------------------- -------------- 5.2/8.1 MB 8.4 MB/s eta 0:00:01\n", + " ---------------------------- ----------- 5.7/8.1 MB 6.8 MB/s eta 0:00:01\n", + " ------------------------------- -------- 6.3/8.1 MB 7.2 MB/s eta 0:00:01\n", + " ----------------------------------- ---- 7.1/8.1 MB 7.7 MB/s eta 0:00:01\n", + " -------------------------------------- - 7.9/8.1 MB 8.1 MB/s eta 0:00:01\n", + " --------------------------------------- 8.1/8.1 MB 8.3 MB/s eta 0:00:01\n", + " --------------------------------------- 8.1/8.1 MB 8.3 MB/s eta 0:00:01\n", + " ---------------------------------------- 8.1/8.1 MB 7.4 MB/s eta 0:00:00\n", + "Downloading numpy-2.3.1-cp312-cp312-win_amd64.whl (12.7 MB)\n", + " ---------------------------------------- 0.0/12.7 MB ? 
eta -:--:--\n", + " - -------------------------------------- 0.6/12.7 MB 12.4 MB/s eta 0:00:01\n", + " --- ------------------------------------ 1.3/12.7 MB 13.3 MB/s eta 0:00:01\n", + " ------ --------------------------------- 2.1/12.7 MB 15.1 MB/s eta 0:00:01\n", + " --------- ------------------------------ 3.0/12.7 MB 15.9 MB/s eta 0:00:01\n", + " ----------- ---------------------------- 3.6/12.7 MB 16.5 MB/s eta 0:00:01\n", + " ------------- -------------------------- 4.4/12.7 MB 15.6 MB/s eta 0:00:01\n", + " ---------------- ----------------------- 5.2/12.7 MB 16.0 MB/s eta 0:00:01\n", + " ----------------- ---------------------- 5.7/12.7 MB 15.2 MB/s eta 0:00:01\n", + " -------------------- ------------------- 6.6/12.7 MB 15.5 MB/s eta 0:00:01\n", + " ---------------------- ----------------- 7.3/12.7 MB 15.5 MB/s eta 0:00:01\n", + " ------------------------ --------------- 7.9/12.7 MB 15.2 MB/s eta 0:00:01\n", + " -------------------------- ------------- 8.5/12.7 MB 15.1 MB/s eta 0:00:01\n", + " ---------------------------- ----------- 9.1/12.7 MB 14.9 MB/s eta 0:00:01\n", + " ------------------------------ --------- 9.7/12.7 MB 14.8 MB/s eta 0:00:01\n", + " -------------------------------- ------- 10.3/12.7 MB 14.6 MB/s eta 0:00:01\n", + " ---------------------------------- ----- 10.9/12.7 MB 14.6 MB/s eta 0:00:01\n", + " ------------------------------------ --- 11.5/12.7 MB 14.9 MB/s eta 0:00:01\n", + " -------------------------------------- - 12.1/12.7 MB 14.2 MB/s eta 0:00:01\n", + " --------------------------------------- 12.7/12.7 MB 13.9 MB/s eta 0:00:01\n", + " --------------------------------------- 12.7/12.7 MB 13.9 MB/s eta 0:00:01\n", + " --------------------------------------- 12.7/12.7 MB 13.9 MB/s eta 0:00:01\n", + " --------------------------------------- 12.7/12.7 MB 13.9 MB/s eta 0:00:01\n", + " ---------------------------------------- 12.7/12.7 MB 11.1 MB/s eta 0:00:00\n", + "Downloading openai-1.97.0-py3-none-any.whl (764 kB)\n", + " ---------------------------------------- 0.0/765.0 kB ? eta -:--:--\n", + " -------------------------------------- 757.8/765.0 kB 24.1 MB/s eta 0:00:01\n", + " --------------------------------------- 765.0/765.0 kB 16.1 MB/s eta 0:00:00\n", + "Using cached pydantic-2.11.7-py3-none-any.whl (444 kB)\n", + "Downloading python_dotenv-1.1.1-py3-none-any.whl (20 kB)\n", + "Using cached redis-6.2.0-py3-none-any.whl (278 kB)\n", + "Downloading tiktoken-0.9.0-cp312-cp312-win_amd64.whl (894 kB)\n", + " ---------------------------------------- 0.0/894.9 kB ? eta -:--:--\n", + " --------------------------- ----------- 634.9/894.9 kB 20.1 MB/s eta 0:00:01\n", + " --------------------------------------- 894.9/894.9 kB 14.3 MB/s eta 0:00:00\n", + "Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)\n", + "Downloading zeroentropy-0.1.0a6-py3-none-any.whl (101 kB)\n", + " ---------------------------------------- 0.0/101.5 kB ? eta -:--:--\n", + " ---------------------------------------- 101.5/101.5 kB 6.1 MB/s eta 0:00:00\n", + "Downloading pydantic_core-2.33.2-cp312-cp312-win_amd64.whl (2.0 MB)\n", + " ---------------------------------------- 0.0/2.0 MB ? 
eta -:--:--\n", + " ------------------- -------------------- 0.9/2.0 MB 30.1 MB/s eta 0:00:01\n", + " --------------------------------------- 1.9/2.0 MB 20.4 MB/s eta 0:00:01\n", + " ---------------------------------------- 2.0/2.0 MB 15.6 MB/s eta 0:00:00\n", + "Using cached aiohappyeyeballs-2.6.1-py3-none-any.whl (15 kB)\n", + "Downloading aiosignal-1.4.0-py3-none-any.whl (7.5 kB)\n", + "Using cached annotated_types-0.7.0-py3-none-any.whl (13 kB)\n", + "Downloading anyio-4.10.0-py3-none-any.whl (107 kB)\n", + " ---------------------------------------- 0.0/107.2 kB ? eta -:--:--\n", + " ---------------------------------------- 107.2/107.2 kB ? eta 0:00:00\n", + "Using cached attrs-25.3.0-py3-none-any.whl (63 kB)\n", + "Downloading contourpy-1.3.3-cp312-cp312-win_amd64.whl (226 kB)\n", + " ---------------------------------------- 0.0/226.6 kB ? eta -:--:--\n", + " --------------------------------------- 226.6/226.6 kB 13.5 MB/s eta 0:00:00\n", + "Using cached cycler-0.12.1-py3-none-any.whl (8.3 kB)\n", + "Using cached dill-0.3.8-py3-none-any.whl (116 kB)\n", + "Using cached distro-1.9.0-py3-none-any.whl (20 kB)\n", + "Downloading fonttools-4.59.0-cp312-cp312-win_amd64.whl (2.2 MB)\n", + " ---------------------------------------- 0.0/2.2 MB ? eta -:--:--\n", + " --------------- ------------------------ 0.9/2.2 MB 18.5 MB/s eta 0:00:01\n", + " ------------------------------ --------- 1.7/2.2 MB 18.0 MB/s eta 0:00:01\n", + " --------------------------------------- 2.2/2.2 MB 17.9 MB/s eta 0:00:01\n", + " ---------------------------------------- 2.2/2.2 MB 14.3 MB/s eta 0:00:00\n", + "Downloading frozenlist-1.7.0-cp312-cp312-win_amd64.whl (43 kB)\n", + " ---------------------------------------- 0.0/43.9 kB ? eta -:--:--\n", + " ------------------------------------- -- 41.0/43.9 kB ? eta -:--:--\n", + " ------------------------------------- -- 41.0/43.9 kB ? eta -:--:--\n", + " ------------------------------------- -- 41.0/43.9 kB ? eta -:--:--\n", + " ------------------------------------- -- 41.0/43.9 kB ? eta -:--:--\n", + " ------------------------------------- -- 41.0/43.9 kB ? eta -:--:--\n", + " ------------------------------------- -- 41.0/43.9 kB ? eta -:--:--\n", + " ------------------------------------- -- 41.0/43.9 kB ? eta -:--:--\n", + " ------------------------------------- -- 41.0/43.9 kB ? eta -:--:--\n", + " ---------------------------------------- 43.9/43.9 kB 79.7 kB/s eta 0:00:00\n", + "Using cached httpx-0.28.1-py3-none-any.whl (73 kB)\n", + "Using cached httpcore-1.0.9-py3-none-any.whl (78 kB)\n", + "Downloading huggingface_hub-0.34.4-py3-none-any.whl (561 kB)\n", + " ---------------------------------------- 0.0/561.5 kB ? eta -:--:--\n", + " --------------------------------------- 561.5/561.5 kB 11.7 MB/s eta 0:00:00\n", + "Downloading jiter-0.10.0-cp312-cp312-win_amd64.whl (206 kB)\n", + " ---------------------------------------- 0.0/206.2 kB ? eta -:--:--\n", + " --------------------------------------- 206.2/206.2 kB 12.2 MB/s eta 0:00:00\n", + "Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl (216 kB)\n", + " ---------------------------------------- 0.0/216.6 kB ? eta -:--:--\n", + " ---------------------------------------- 216.6/216.6 kB 6.7 MB/s eta 0:00:00\n", + "Downloading kiwisolver-1.4.9-cp312-cp312-win_amd64.whl (73 kB)\n", + " ---------------------------------------- 0.0/73.9 kB ? eta -:--:--\n", + " ---------------------------------------- 73.9/73.9 kB ? 
eta 0:00:00\n", + "Downloading multidict-6.6.4-cp312-cp312-win_amd64.whl (46 kB)\n", + " ---------------------------------------- 0.0/46.1 kB ? eta -:--:--\n", + " ---------------------------------------- 46.1/46.1 kB 2.2 MB/s eta 0:00:00\n", + "Downloading multiprocess-0.70.16-py312-none-any.whl (146 kB)\n", + " ---------------------------------------- 0.0/146.7 kB ? eta -:--:--\n", + " ---------------------------------------- 146.7/146.7 kB 8.5 MB/s eta 0:00:00\n", + "Downloading pillow-11.3.0-cp312-cp312-win_amd64.whl (7.0 MB)\n", + " ---------------------------------------- 0.0/7.0 MB ? eta -:--:--\n", + " ---- ----------------------------------- 0.7/7.0 MB 22.8 MB/s eta 0:00:01\n", + " -------- ------------------------------- 1.5/7.0 MB 18.6 MB/s eta 0:00:01\n", + " ------------- -------------------------- 2.3/7.0 MB 18.5 MB/s eta 0:00:01\n", + " ------------------ --------------------- 3.2/7.0 MB 18.5 MB/s eta 0:00:01\n", + " ---------------------- ----------------- 3.9/7.0 MB 17.8 MB/s eta 0:00:01\n", + " --------------------------- ------------ 4.7/7.0 MB 17.8 MB/s eta 0:00:01\n", + " ------------------------------ --------- 5.4/7.0 MB 17.3 MB/s eta 0:00:01\n", + " ---------------------------------- ----- 6.1/7.0 MB 16.9 MB/s eta 0:00:01\n", + " ------------------------------------- -- 6.6/7.0 MB 16.3 MB/s eta 0:00:01\n", + " --------------------------------------- 7.0/7.0 MB 16.0 MB/s eta 0:00:01\n", + " --------------------------------------- 7.0/7.0 MB 16.0 MB/s eta 0:00:01\n", + " --------------------------------------- 7.0/7.0 MB 16.0 MB/s eta 0:00:01\n", + " ---------------------------------------- 7.0/7.0 MB 12.4 MB/s eta 0:00:00\n", + "Downloading propcache-0.3.2-cp312-cp312-win_amd64.whl (41 kB)\n", + " ---------------------------------------- 0.0/41.5 kB ? eta -:--:--\n", + " ---------------------------------------- 41.5/41.5 kB 2.1 MB/s eta 0:00:00\n", + "Downloading pyarrow-21.0.0-cp312-cp312-win_amd64.whl (26.2 MB)\n", + " ---------------------------------------- 0.0/26.2 MB ? 
eta -:--:--\n", + " --------------------------------------- 0.5/26.2 MB 16.2 MB/s eta 0:00:02\n", + " - -------------------------------------- 1.2/26.2 MB 15.4 MB/s eta 0:00:02\n", + " --- ------------------------------------ 2.0/26.2 MB 16.1 MB/s eta 0:00:02\n", + " ---- ----------------------------------- 2.7/26.2 MB 15.8 MB/s eta 0:00:02\n", + " ----- ---------------------------------- 3.4/26.2 MB 15.4 MB/s eta 0:00:02\n", + " ------ --------------------------------- 4.0/26.2 MB 15.0 MB/s eta 0:00:02\n", + " ------- -------------------------------- 4.7/26.2 MB 14.9 MB/s eta 0:00:02\n", + " -------- ------------------------------- 5.4/26.2 MB 15.0 MB/s eta 0:00:02\n", + " --------- ------------------------------ 6.1/26.2 MB 15.1 MB/s eta 0:00:02\n", + " ---------- ----------------------------- 6.9/26.2 MB 15.2 MB/s eta 0:00:02\n", + " ----------- ---------------------------- 7.6/26.2 MB 15.3 MB/s eta 0:00:02\n", + " ------------ --------------------------- 8.3/26.2 MB 15.1 MB/s eta 0:00:02\n", + " ------------- -------------------------- 9.0/26.2 MB 15.2 MB/s eta 0:00:02\n", + " -------------- ------------------------- 9.7/26.2 MB 15.1 MB/s eta 0:00:02\n", + " --------------- ------------------------ 10.3/26.2 MB 14.9 MB/s eta 0:00:02\n", + " ---------------- ----------------------- 10.9/26.2 MB 14.9 MB/s eta 0:00:02\n", + " ----------------- ---------------------- 11.5/26.2 MB 14.6 MB/s eta 0:00:02\n", + " ------------------ --------------------- 12.1/26.2 MB 14.6 MB/s eta 0:00:01\n", + " ------------------- -------------------- 12.7/26.2 MB 14.2 MB/s eta 0:00:01\n", + " -------------------- ------------------- 13.3/26.2 MB 14.2 MB/s eta 0:00:01\n", + " --------------------- ------------------ 13.9/26.2 MB 14.2 MB/s eta 0:00:01\n", + " ---------------------- ----------------- 14.5/26.2 MB 13.9 MB/s eta 0:00:01\n", + " ----------------------- ---------------- 15.1/26.2 MB 14.2 MB/s eta 0:00:01\n", + " ------------------------ --------------- 15.7/26.2 MB 13.9 MB/s eta 0:00:01\n", + " ------------------------ --------------- 16.4/26.2 MB 13.6 MB/s eta 0:00:01\n", + " ------------------------- -------------- 16.9/26.2 MB 13.4 MB/s eta 0:00:01\n", + " -------------------------- ------------- 17.5/26.2 MB 13.4 MB/s eta 0:00:01\n", + " --------------------------- ------------ 18.1/26.2 MB 13.4 MB/s eta 0:00:01\n", + " ---------------------------- ----------- 18.8/26.2 MB 13.4 MB/s eta 0:00:01\n", + " ----------------------------- ---------- 19.3/26.2 MB 12.8 MB/s eta 0:00:01\n", + " ------------------------------ --------- 20.0/26.2 MB 12.8 MB/s eta 0:00:01\n", + " ------------------------------- -------- 20.5/26.2 MB 12.9 MB/s eta 0:00:01\n", + " -------------------------------- ------- 21.1/26.2 MB 12.8 MB/s eta 0:00:01\n", + " --------------------------------- ------ 21.7/26.2 MB 12.8 MB/s eta 0:00:01\n", + " ---------------------------------- ----- 22.4/26.2 MB 12.8 MB/s eta 0:00:01\n", + " ---------------------------------- ----- 22.9/26.2 MB 12.8 MB/s eta 0:00:01\n", + " ----------------------------------- ---- 23.5/26.2 MB 12.8 MB/s eta 0:00:01\n", + " ------------------------------------ --- 24.0/26.2 MB 12.6 MB/s eta 0:00:01\n", + " ------------------------------------- -- 24.6/26.2 MB 12.6 MB/s eta 0:00:01\n", + " -------------------------------------- - 25.1/26.2 MB 12.6 MB/s eta 0:00:01\n", + " --------------------------------------- 25.7/26.2 MB 12.6 MB/s eta 0:00:01\n", + " --------------------------------------- 26.2/26.2 MB 12.6 MB/s eta 0:00:01\n", + " 
---------------------------------------- 26.2/26.2 MB 7.4 MB/s eta 0:00:00\n", + "Using cached pyparsing-3.2.3-py3-none-any.whl (111 kB)\n", + "Downloading PyYAML-6.0.2-cp312-cp312-win_amd64.whl (156 kB)\n", + "   ---------------------------------------- 0.0/156.3 kB ? eta -:--:--\n", + "   ---------------------------------------- 156.3/156.3 kB 9.1 MB/s eta 0:00:00\n", + "Downloading regex-2025.7.34-cp312-cp312-win_amd64.whl (275 kB)\n", + "   ---------------------------------------- 0.0/275.4 kB ? eta -:--:--\n", + "   --------------------------------------- 275.4/275.4 kB 16.6 MB/s eta 0:00:00\n", + "Using cached requests-2.32.4-py3-none-any.whl (64 kB)\n", + "Using cached sniffio-1.3.1-py3-none-any.whl (10 kB)\n", + "Downloading typing_extensions-4.14.1-py3-none-any.whl (43 kB)\n", + "   ---------------------------------------- 0.0/43.9 kB ? eta -:--:--\n", + "   ---------------------------------------- 43.9/43.9 kB 2.2 MB/s eta 0:00:00\n", + "Using cached typing_inspection-0.4.1-py3-none-any.whl (14 kB)\n", + "Downloading widgetsnbextension-4.0.14-py3-none-any.whl (2.2 MB)\n", + "   ---------------------------------------- 0.0/2.2 MB ? eta -:--:--\n", + "   -------------- ------------------------- 0.8/2.2 MB 17.4 MB/s eta 0:00:01\n", + "   ---------------------------- ----------- 1.6/2.2 MB 16.7 MB/s eta 0:00:01\n", + "   --------------------------------------- 2.2/2.2 MB 17.4 MB/s eta 0:00:01\n", + "   ---------------------------------------- 2.2/2.2 MB 12.8 MB/s eta 0:00:00\n", + "Using cached win32_setctime-1.2.0-py3-none-any.whl (4.1 kB)\n", + "Downloading yarl-1.20.1-cp312-cp312-win_amd64.whl (86 kB)\n", + "   ---------------------------------------- 0.0/86.7 kB ? eta -:--:--\n", + "   ---------------------------------------- 86.7/86.7 kB 5.1 MB/s eta 0:00:00\n", + "Using cached filelock-3.18.0-py3-none-any.whl (16 kB)\n", + "Downloading pandas-2.3.1-cp312-cp312-win_amd64.whl (11.0 MB)\n", + "   ---------------------------------------- 0.0/11.0 MB ? 
eta -:--:--\n", + " - -------------------------------------- 0.5/11.0 MB 17.2 MB/s eta 0:00:01\n", + " ---- ----------------------------------- 1.2/11.0 MB 15.5 MB/s eta 0:00:01\n", + " ------ --------------------------------- 1.9/11.0 MB 14.9 MB/s eta 0:00:01\n", + " --------- ------------------------------ 2.6/11.0 MB 15.2 MB/s eta 0:00:01\n", + " ------------ --------------------------- 3.3/11.0 MB 15.3 MB/s eta 0:00:01\n", + " --------------- ------------------------ 4.2/11.0 MB 15.7 MB/s eta 0:00:01\n", + " ------------------ --------------------- 5.0/11.0 MB 16.1 MB/s eta 0:00:01\n", + " --------------------- ------------------ 6.0/11.0 MB 16.7 MB/s eta 0:00:01\n", + " ------------------------ --------------- 6.7/11.0 MB 16.4 MB/s eta 0:00:01\n", + " -------------------------- ------------- 7.3/11.0 MB 16.0 MB/s eta 0:00:01\n", + " ---------------------------- ----------- 7.9/11.0 MB 15.7 MB/s eta 0:00:01\n", + " ------------------------------ --------- 8.5/11.0 MB 15.6 MB/s eta 0:00:01\n", + " --------------------------------- ------ 9.1/11.0 MB 15.4 MB/s eta 0:00:01\n", + " ----------------------------------- ---- 9.7/11.0 MB 15.2 MB/s eta 0:00:01\n", + " ------------------------------------- -- 10.3/11.0 MB 14.9 MB/s eta 0:00:01\n", + " --------------------------------------- 10.9/11.0 MB 14.9 MB/s eta 0:00:01\n", + " --------------------------------------- 11.0/11.0 MB 14.9 MB/s eta 0:00:01\n", + " --------------------------------------- 11.0/11.0 MB 14.9 MB/s eta 0:00:01\n", + " --------------------------------------- 11.0/11.0 MB 14.9 MB/s eta 0:00:01\n", + " --------------------------------------- 11.0/11.0 MB 14.9 MB/s eta 0:00:01\n", + " --------------------------------------- 11.0/11.0 MB 14.9 MB/s eta 0:00:01\n", + " --------------------------------------- 11.0/11.0 MB 14.9 MB/s eta 0:00:01\n", + " ---------------------------------------- 11.0/11.0 MB 10.4 MB/s eta 0:00:00\n", + "Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl (30 kB)\n", + "Downloading certifi-2025.8.3-py3-none-any.whl (161 kB)\n", + " ---------------------------------------- 0.0/161.2 kB ? eta -:--:--\n", + " ---------------------------------------- 161.2/161.2 kB ? eta 0:00:00\n", + "Downloading charset_normalizer-3.4.3-cp312-cp312-win_amd64.whl (107 kB)\n", + " ---------------------------------------- 0.0/107.5 kB ? eta -:--:--\n", + " ---------------------------------------- 107.5/107.5 kB 6.1 MB/s eta 0:00:00\n", + "Downloading fsspec-2025.3.0-py3-none-any.whl (193 kB)\n", + " ---------------------------------------- 0.0/193.6 kB ? 
eta -:--:--\n", + " --------------------------------------- 193.6/193.6 kB 12.2 MB/s eta 0:00:00\n", + "Using cached idna-3.10-py3-none-any.whl (70 kB)\n", + "Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)\n", + "Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)\n", + "Using cached urllib3-2.5.0-py3-none-any.whl (129 kB)\n", + "Using cached h11-0.16.0-py3-none-any.whl (37 kB)\n", + "Building wheels for collected packages: zbench, openlimit\n", + " Building editable for zbench (pyproject.toml): started\n", + " Building editable for zbench (pyproject.toml): finished with status 'done'\n", + " Created wheel for zbench: filename=zbench-0.1.0-0.editable-py3-none-any.whl size=2893 sha256=906a3df7fa2efad94ef8b4b8f6b81c290ccf161c9be152cfb9ea651f26b69f1a\n", + " Stored in directory: C:\\Users\\polasani rohit\\AppData\\Local\\Temp\\pip-ephem-wheel-cache-hbmoyqiy\\wheels\\56\\00\\bc\\493af3709223295cdc95c3c65cd90ffb8f258d3345567413c1\n", + " Building wheel for openlimit (pyproject.toml): started\n", + " Building wheel for openlimit (pyproject.toml): finished with status 'done'\n", + " Created wheel for openlimit: filename=openlimit-0.3.0-py3-none-any.whl size=13202 sha256=85f8d28951bed66d6f266fecb2a9fbea9e32da780318f77730ab5eb25e2bf1da\n", + " Stored in directory: C:\\Users\\polasani rohit\\AppData\\Local\\Temp\\pip-ephem-wheel-cache-hbmoyqiy\\wheels\\ae\\76\\04\\ad32d4268ee00aaee992ccfa8dc7042f54f98d8799b9a4fe65\n", + "Successfully built zbench openlimit\n", + "Installing collected packages: pytz, xxhash, win32-setctime, widgetsnbextension, urllib3, tzdata, typing-extensions, tqdm, sniffio, regex, redis, pyyaml, python-dotenv, pyparsing, pyarrow, propcache, pillow, numpy, multidict, kiwisolver, jupyterlab_widgets, jiter, idna, h11, fsspec, frozenlist, fonttools, filelock, distro, dill, cycler, charset_normalizer, certifi, attrs, annotated-types, aiohappyeyeballs, yarl, typing-inspection, requests, pydantic-core, pandas, multiprocess, loguru, httpcore, contourpy, anyio, aiosignal, tiktoken, pydantic, matplotlib, ipywidgets, ipykernel, huggingface-hub, httpx, aiohttp, zeroentropy, openlimit, openai, anthropic, datasets, zbench\n", + " Attempting uninstall: ipykernel\n", + " Found existing installation: ipykernel 6.30.1\n", + " Uninstalling ipykernel-6.30.1:\n", + " Successfully uninstalled ipykernel-6.30.1\n", + "Successfully installed aiohappyeyeballs-2.6.1 aiohttp-3.12.14 aiosignal-1.4.0 annotated-types-0.7.0 anthropic-0.57.1 anyio-4.10.0 attrs-25.3.0 certifi-2025.8.3 charset_normalizer-3.4.3 contourpy-1.3.3 cycler-0.12.1 datasets-4.0.0 dill-0.3.8 distro-1.9.0 filelock-3.18.0 fonttools-4.59.0 frozenlist-1.7.0 fsspec-2025.3.0 h11-0.16.0 httpcore-1.0.9 httpx-0.28.1 huggingface-hub-0.34.4 idna-3.10 ipykernel-6.30.0 ipywidgets-8.1.7 jiter-0.10.0 jupyterlab_widgets-3.0.15 kiwisolver-1.4.9 loguru-0.7.3 matplotlib-3.10.3 multidict-6.6.4 multiprocess-0.70.16 numpy-2.3.1 openai-1.97.0 openlimit-0.3.0 pandas-2.3.1 pillow-11.3.0 propcache-0.3.2 pyarrow-21.0.0 pydantic-2.11.7 pydantic-core-2.33.2 pyparsing-3.2.3 python-dotenv-1.1.1 pytz-2025.2 pyyaml-6.0.2 redis-6.2.0 regex-2025.7.34 requests-2.32.4 sniffio-1.3.1 tiktoken-0.9.0 tqdm-4.67.1 typing-extensions-4.14.1 typing-inspection-0.4.1 tzdata-2025.2 urllib3-2.5.0 widgetsnbextension-4.0.14 win32-setctime-1.2.0 xxhash-3.5.0 yarl-1.20.1 zbench-0.1.0 zeroentropy-0.1.0a6\n", + "Note: you may need to restart the kernel to use updated packages.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Creating pairs: 100%|██████████| 1/1 
[00:00<00:00, 468.17it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Created 12 pairs and saved to data/annotation/my_sample_dataset/pairs.json\n", - "Step 3: Scoring pairs\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Scoring Pairs: 100%|██████████| 12/12 [00:27<00:00, 2.26s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Scored 12 pairs and saved to data/annotation/my_sample_dataset/ai_scores.json\n", - "Step 4: Composing annotated dataset\n", - "Created 1 annotated lines\n", - "Saved annotated dataset to ./data/my_sample_dataset_zelo_annotated.jsonl\n", + " Running command git clone --filter=blob:none --quiet https://github.com/shobrook/openlimit.git 'C:\\Users\\polasani rohit\\AppData\\Local\\Temp\\pip-install-x2voovyh\\openlimit_f3705ea5465049dbbc8500a67df63878'\n", + " WARNING: The script tqdm.exe is installed in 'c:\\Users\\polasani rohit\\AppData\\Local\\Programs\\Python\\Python312\\Scripts' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n", + " WARNING: The script dotenv.exe is installed in 'c:\\Users\\polasani rohit\\AppData\\Local\\Programs\\Python\\Python312\\Scripts' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n", + " WARNING: The scripts f2py.exe and numpy-config.exe are installed in 'c:\\Users\\polasani rohit\\AppData\\Local\\Programs\\Python\\Python312\\Scripts' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n", + " WARNING: The scripts fonttools.exe, pyftmerge.exe, pyftsubset.exe and ttx.exe are installed in 'c:\\Users\\polasani rohit\\AppData\\Local\\Programs\\Python\\Python312\\Scripts' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n", + " WARNING: The script distro.exe is installed in 'c:\\Users\\polasani rohit\\AppData\\Local\\Programs\\Python\\Python312\\Scripts' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n", + " WARNING: The script normalizer.exe is installed in 'c:\\Users\\polasani rohit\\AppData\\Local\\Programs\\Python\\Python312\\Scripts' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n", + " WARNING: The scripts hf.exe, huggingface-cli.exe and tiny-agents.exe are installed in 'c:\\Users\\polasani rohit\\AppData\\Local\\Programs\\Python\\Python312\\Scripts' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n", + " WARNING: The script httpx.exe is installed in 'c:\\Users\\polasani rohit\\AppData\\Local\\Programs\\Python\\Python312\\Scripts' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n", + " WARNING: The script openai.exe is installed in 'c:\\Users\\polasani rohit\\AppData\\Local\\Programs\\Python\\Python312\\Scripts' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n", + " WARNING: The script 
datasets-cli.exe is installed in 'c:\\Users\\polasani rohit\\AppData\\Local\\Programs\\Python\\Python312\\Scripts' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n", "\n", - "🎉 Annotation completed successfully!\n", - "Final outputs:\n", - "Annotated dataset: ./data/my_sample_dataset_zelo_annotated.jsonl\n" + "[notice] A new release of pip is available: 23.2.1 -> 25.2\n", + "[notice] To update, run: python.exe -m pip install --upgrade pip\n" ] - }, + } + ], + "source": [ + "pip install -e ." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" + "ename": "ModuleNotFoundError", + "evalue": "No module named 'fcntl'", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 4\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# At first, we need to Ensemble annotate the dataset.\u001b[39;00m\n\u001b[32m 2\u001b[39m \u001b[38;5;66;03m# We can do this by running the following code:\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mzbench\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mannotation\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m EnsembleZELOAnnotator\n\u001b[32m 6\u001b[39m \u001b[38;5;66;03m# We'll initialize an ELO annotator that will read from our example dataset, and output an annotated dataset.\u001b[39;00m\n\u001b[32m 7\u001b[39m SOURCE_DATASET_PATH = \u001b[33m\"\u001b[39m\u001b[33mjob_resume_data.jsonl\u001b[39m\u001b[33m\"\u001b[39m\n", + "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\polasani rohit\\OneDrive\\Desktop\\zbench\\zbench\\annotation.py:10\u001b[39m\n\u001b[32m 7\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mrandom\u001b[39;00m\n\u001b[32m 8\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mshutil\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m copy2\n\u001b[32m---> \u001b[39m\u001b[32m10\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mzbench\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mutils\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[32m 11\u001b[39m load_jsonl, load_json, save_pydantic_jsonl, save_pydantic_json, calculate_elos\n\u001b[32m 12\u001b[39m )\n\u001b[32m 13\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mzbench\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mcommon_types\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m DatasetPairs, DatasetPair, Dataset, AnnotatedDataset, AnnotatedQueryDocuments, AnnotatedDocument, DatasetPairScoredPairs, DatasetPairDocument\n\u001b[32m 14\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mzbench\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mscore\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Score\n", + "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\polasani 
rohit\\OneDrive\\Desktop\\zbench\\zbench\\utils.py:2\u001b[39m\n\u001b[32m      1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01masyncio\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mfcntl\u001b[39;00m\n\u001b[32m      3\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mhashlib\u001b[39;00m\n\u001b[32m      4\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mjson\u001b[39;00m\n",
+    "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'fcntl'"
+   ]
+  }
+ ],
@@ -118,6 +655,7 @@
    "from zbench.annotation import EnsembleZELOAnnotator\n",
    "\n",
    "# We'll initialize an ELO annotator that will read from our example dataset, and output an annotated dataset.\n",
+   "SOURCE_DATASET_PATH = \"job_resume_data.jsonl\"\n",
    "ANNOTATED_DATASET_PATH = \"./data/my_sample_dataset_zelo_annotated.jsonl\"\n",
    "annotator = EnsembleZELOAnnotator(SOURCE_DATASET_PATH, ANNOTATED_DATASET_PATH)\n",
    "\n",
@@ -127,26 +665,15 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
-     "data": {
-      "image/png": "<base64-encoded PNG data for the removed matplotlib figure output elided>",
-      "text/plain": [
-       "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[np.float64(0.6316740659321393)]\n", - "[0.6666666666666666]\n", - "[0.6]\n" + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31mThe kernel failed to start as the Python Environment 'Python' is no longer available. Consider selecting another kernel or refreshing the list of Python Environments." ] } ], @@ -459,7 +986,7 @@ ], "metadata": { "kernelspec": { - "display_name": "zbench", + "display_name": "Python 3", "language": "python", "name": "python3" }, diff --git a/pyproject.toml b/pyproject.toml index 28ab648..db49d43 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ dependencies = [ "matplotlib==3.10.3", "ipykernel==6.30.0", "ipywidgets==8.1.7", + "python-dotenv" ] [tool.setuptools.packages.find] diff --git a/requirements-dev.lock b/requirements-dev.lock new file mode 100644 index 0000000..f4d5edf --- /dev/null +++ b/requirements-dev.lock @@ -0,0 +1,251 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false +# generate-hashes: false +# universal: false + +-e file:. +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.12.14 + # via fsspec + # via zbench +aiosignal==1.4.0 + # via aiohttp +annotated-types==0.7.0 + # via pydantic +anthropic==0.57.1 + # via zbench +anyio==4.10.0 + # via anthropic + # via httpx + # via openai + # via zeroentropy +asttokens==3.0.0 + # via stack-data +attrs==25.3.0 + # via aiohttp +certifi==2025.8.3 + # via httpcore + # via httpx + # via requests +charset-normalizer==3.4.3 + # via requests +comm==0.2.3 + # via ipykernel + # via ipywidgets +contourpy==1.3.3 + # via matplotlib +cycler==0.12.1 + # via matplotlib +datasets==4.0.0 + # via zbench +debugpy==1.8.16 + # via ipykernel +decorator==5.2.1 + # via ipython +dill==0.3.8 + # via datasets + # via multiprocess +distro==1.9.0 + # via anthropic + # via openai + # via zeroentropy +executing==2.2.0 + # via stack-data +filelock==3.18.0 + # via datasets + # via huggingface-hub +fonttools==4.59.0 + # via matplotlib +frozenlist==1.7.0 + # via aiohttp + # via aiosignal +fsspec==2025.3.0 + # via datasets + # via huggingface-hub +h11==0.16.0 + # via httpcore +hf-xet==1.1.7 + # via huggingface-hub +httpcore==1.0.9 + # via httpx +httpx==0.28.1 + # via anthropic + # via openai + # via zeroentropy +huggingface-hub==0.34.4 + # via datasets +idna==3.10 + # via anyio + # via httpx + # via requests + # via yarl +ipykernel==6.30.0 + # via zbench +ipython==9.4.0 + # via ipykernel + # via ipywidgets +ipython-pygments-lexers==1.1.1 + # via ipython +ipywidgets==8.1.7 + # via zbench +jedi==0.19.2 + # via ipython +jiter==0.10.0 + # via anthropic + # via openai +jupyter-client==8.6.3 + # via ipykernel +jupyter-core==5.8.1 + # via ipykernel + # via jupyter-client +jupyterlab-widgets==3.0.15 + # via ipywidgets +kiwisolver==1.4.9 + # via matplotlib +loguru==0.7.3 + # via zbench +matplotlib==3.10.3 + # via zbench +matplotlib-inline==0.1.7 + # via ipykernel + # via ipython +multidict==6.6.4 + # via aiohttp + # via yarl +multiprocess==0.70.16 + # via datasets +nest-asyncio==1.6.0 + # via ipykernel +numpy==2.3.1 + # via contourpy + # via datasets + # via matplotlib + # via pandas + # via zbench +openai==1.97.0 + # via zbench +openlimit @ git+https://github.com/shobrook/openlimit.git@dbacec38467ab17d99607871ef6742d301c34470 
+ # via zbench +packaging==25.0 + # via datasets + # via huggingface-hub + # via ipykernel + # via matplotlib +pandas==2.3.1 + # via datasets +parso==0.8.4 + # via jedi +pexpect==4.9.0 + # via ipython +pillow==11.3.0 + # via matplotlib +platformdirs==4.3.8 + # via jupyter-core +prompt-toolkit==3.0.51 + # via ipython +propcache==0.3.2 + # via aiohttp + # via yarl +psutil==7.0.0 + # via ipykernel +ptyprocess==0.7.0 + # via pexpect +pure-eval==0.2.3 + # via stack-data +pyarrow==21.0.0 + # via datasets +pydantic==2.11.7 + # via anthropic + # via openai + # via zbench + # via zeroentropy +pydantic-core==2.33.2 + # via pydantic +pygments==2.19.2 + # via ipython + # via ipython-pygments-lexers +pyparsing==3.2.3 + # via matplotlib +python-dateutil==2.9.0.post0 + # via jupyter-client + # via matplotlib + # via pandas +python-dotenv==1.1.1 + # via zbench +pytz==2025.2 + # via pandas +pyyaml==6.0.2 + # via datasets + # via huggingface-hub +pyzmq==27.0.1 + # via ipykernel + # via jupyter-client +redis==6.2.0 + # via openlimit + # via zbench +regex==2025.7.34 + # via tiktoken +requests==2.32.4 + # via datasets + # via huggingface-hub + # via tiktoken +six==1.17.0 + # via python-dateutil +sniffio==1.3.1 + # via anthropic + # via anyio + # via openai + # via zeroentropy +stack-data==0.6.3 + # via ipython +tiktoken==0.9.0 + # via openlimit + # via zbench +tornado==6.5.2 + # via ipykernel + # via jupyter-client +tqdm==4.67.1 + # via datasets + # via huggingface-hub + # via openai + # via zbench +traitlets==5.14.3 + # via ipykernel + # via ipython + # via ipywidgets + # via jupyter-client + # via jupyter-core + # via matplotlib-inline +typing-extensions==4.14.1 + # via aiosignal + # via anthropic + # via anyio + # via huggingface-hub + # via openai + # via pydantic + # via pydantic-core + # via typing-inspection + # via zeroentropy +typing-inspection==0.4.1 + # via pydantic +tzdata==2025.2 + # via pandas +urllib3==2.5.0 + # via requests +wcwidth==0.2.13 + # via prompt-toolkit +widgetsnbextension==4.0.14 + # via ipywidgets +xxhash==3.5.0 + # via datasets +yarl==1.20.1 + # via aiohttp +zeroentropy==0.1.0a6 + # via zbench diff --git a/requirements.lock b/requirements.lock new file mode 100644 index 0000000..f4d5edf --- /dev/null +++ b/requirements.lock @@ -0,0 +1,251 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false +# generate-hashes: false +# universal: false + +-e file:. 
+aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.12.14 + # via fsspec + # via zbench +aiosignal==1.4.0 + # via aiohttp +annotated-types==0.7.0 + # via pydantic +anthropic==0.57.1 + # via zbench +anyio==4.10.0 + # via anthropic + # via httpx + # via openai + # via zeroentropy +asttokens==3.0.0 + # via stack-data +attrs==25.3.0 + # via aiohttp +certifi==2025.8.3 + # via httpcore + # via httpx + # via requests +charset-normalizer==3.4.3 + # via requests +comm==0.2.3 + # via ipykernel + # via ipywidgets +contourpy==1.3.3 + # via matplotlib +cycler==0.12.1 + # via matplotlib +datasets==4.0.0 + # via zbench +debugpy==1.8.16 + # via ipykernel +decorator==5.2.1 + # via ipython +dill==0.3.8 + # via datasets + # via multiprocess +distro==1.9.0 + # via anthropic + # via openai + # via zeroentropy +executing==2.2.0 + # via stack-data +filelock==3.18.0 + # via datasets + # via huggingface-hub +fonttools==4.59.0 + # via matplotlib +frozenlist==1.7.0 + # via aiohttp + # via aiosignal +fsspec==2025.3.0 + # via datasets + # via huggingface-hub +h11==0.16.0 + # via httpcore +hf-xet==1.1.7 + # via huggingface-hub +httpcore==1.0.9 + # via httpx +httpx==0.28.1 + # via anthropic + # via openai + # via zeroentropy +huggingface-hub==0.34.4 + # via datasets +idna==3.10 + # via anyio + # via httpx + # via requests + # via yarl +ipykernel==6.30.0 + # via zbench +ipython==9.4.0 + # via ipykernel + # via ipywidgets +ipython-pygments-lexers==1.1.1 + # via ipython +ipywidgets==8.1.7 + # via zbench +jedi==0.19.2 + # via ipython +jiter==0.10.0 + # via anthropic + # via openai +jupyter-client==8.6.3 + # via ipykernel +jupyter-core==5.8.1 + # via ipykernel + # via jupyter-client +jupyterlab-widgets==3.0.15 + # via ipywidgets +kiwisolver==1.4.9 + # via matplotlib +loguru==0.7.3 + # via zbench +matplotlib==3.10.3 + # via zbench +matplotlib-inline==0.1.7 + # via ipykernel + # via ipython +multidict==6.6.4 + # via aiohttp + # via yarl +multiprocess==0.70.16 + # via datasets +nest-asyncio==1.6.0 + # via ipykernel +numpy==2.3.1 + # via contourpy + # via datasets + # via matplotlib + # via pandas + # via zbench +openai==1.97.0 + # via zbench +openlimit @ git+https://github.com/shobrook/openlimit.git@dbacec38467ab17d99607871ef6742d301c34470 + # via zbench +packaging==25.0 + # via datasets + # via huggingface-hub + # via ipykernel + # via matplotlib +pandas==2.3.1 + # via datasets +parso==0.8.4 + # via jedi +pexpect==4.9.0 + # via ipython +pillow==11.3.0 + # via matplotlib +platformdirs==4.3.8 + # via jupyter-core +prompt-toolkit==3.0.51 + # via ipython +propcache==0.3.2 + # via aiohttp + # via yarl +psutil==7.0.0 + # via ipykernel +ptyprocess==0.7.0 + # via pexpect +pure-eval==0.2.3 + # via stack-data +pyarrow==21.0.0 + # via datasets +pydantic==2.11.7 + # via anthropic + # via openai + # via zbench + # via zeroentropy +pydantic-core==2.33.2 + # via pydantic +pygments==2.19.2 + # via ipython + # via ipython-pygments-lexers +pyparsing==3.2.3 + # via matplotlib +python-dateutil==2.9.0.post0 + # via jupyter-client + # via matplotlib + # via pandas +python-dotenv==1.1.1 + # via zbench +pytz==2025.2 + # via pandas +pyyaml==6.0.2 + # via datasets + # via huggingface-hub +pyzmq==27.0.1 + # via ipykernel + # via jupyter-client +redis==6.2.0 + # via openlimit + # via zbench +regex==2025.7.34 + # via tiktoken +requests==2.32.4 + # via datasets + # via huggingface-hub + # via tiktoken +six==1.17.0 + # via python-dateutil +sniffio==1.3.1 + # via anthropic + # via anyio + # via openai + # via zeroentropy +stack-data==0.6.3 + # via ipython 
+tiktoken==0.9.0 + # via openlimit + # via zbench +tornado==6.5.2 + # via ipykernel + # via jupyter-client +tqdm==4.67.1 + # via datasets + # via huggingface-hub + # via openai + # via zbench +traitlets==5.14.3 + # via ipykernel + # via ipython + # via ipywidgets + # via jupyter-client + # via jupyter-core + # via matplotlib-inline +typing-extensions==4.14.1 + # via aiosignal + # via anthropic + # via anyio + # via huggingface-hub + # via openai + # via pydantic + # via pydantic-core + # via typing-inspection + # via zeroentropy +typing-inspection==0.4.1 + # via pydantic +tzdata==2025.2 + # via pandas +urllib3==2.5.0 + # via requests +wcwidth==0.2.13 + # via prompt-toolkit +widgetsnbextension==4.0.14 + # via ipywidgets +xxhash==3.5.0 + # via datasets +yarl==1.20.1 + # via aiohttp +zeroentropy==0.1.0a6 + # via zbench diff --git a/test.py b/test.py new file mode 100644 index 0000000..d5eecab --- /dev/null +++ b/test.py @@ -0,0 +1,100 @@ +import json + +def extract_and_print_filename_score(json_file): + """ + Extract and print filename and score mappings from JSON file + """ + try: + with open(json_file, 'r', encoding='utf-8') as f: + data = json.load(f) + + # Handle both single object and array of objects + if isinstance(data, list): + records = data + else: + records = [data] + + print("=" * 80) + print("FILENAME AND SCORE MAPPINGS") + print("=" * 80) + + total_mappings = 0 + all_scores = [] + + for record_idx, record in enumerate(records): + try: + # Get query info if available + query_info = record.get("query", {}) + query_id = query_info.get("id", f"Record {record_idx + 1}") + query_text = query_info.get("query", "") + + if query_text: + print(f"\nQuery ID: {query_id}") + print(f"Query: {query_text[:100]}{'...' if len(query_text) > 100 else ''}") + print("-" * 80) + + # Navigate to documents + documents = record.get("documents", []) + + if not documents: + print(f"No documents found in record {record_idx + 1}") + continue + + # Collect all documents with scores for sorting + doc_scores = [] + for doc in documents: + # Get filename from metadata + metadata = doc.get("metadata", {}) + filename = metadata.get("filename") + + # Get score from document level + score = doc.get("score") + + if filename is not None and score is not None: + doc_scores.append((filename, score)) + all_scores.append(score) + + # Sort by score in descending order (highest first) + doc_scores.sort(key=lambda x: x[1], reverse=True) + + # Print sorted documents + for doc_idx, (filename, score) in enumerate(doc_scores, 1): + print(f" Document {doc_idx:2d}: {filename:<40} | Score: {score:.8f}") + total_mappings += 1 + + # Print any documents with missing data + missing_count = len(documents) - len(doc_scores) + if missing_count > 0: + print(f" {missing_count} document(s) with missing filename or score") + + except Exception as e: + print(f"Warning: Error processing record {record_idx + 1}: {e}") + continue + + # Print summary + print("=" * 80) + print(f"SUMMARY: {total_mappings} filename-score mappings found") + + if all_scores: + print(f"Score Range: {min(all_scores):.8f} to {max(all_scores):.8f}") + print(f"Average Score: {sum(all_scores)/len(all_scores):.8f}") + print("=" * 80) + + return total_mappings + + except FileNotFoundError: + print(f"❌ Error: File '{json_file}' not found.") + return 0 + except json.JSONDecodeError as e: + print(f"❌ Error: Invalid JSON format - {e}") + return 0 + except Exception as e: + print(f"❌ Error: {e}") + return 0 + +# Example usage +if __name__ == "__main__": + json_file = 
"annotated_resumes_array.json" # or your actual JSON file name + + # Extract and print the mappings + extract_and_print_filename_score(json_file) \ No newline at end of file diff --git a/test2.py b/test2.py new file mode 100644 index 0000000..6fb7e87 --- /dev/null +++ b/test2.py @@ -0,0 +1,220 @@ +import asyncio +import json +import statistics +import random +from typing import List, Dict + +import openai +from groq import AsyncGroq +import anthropic +from pydantic import BaseModel +from dotenv import load_dotenv + +load_dotenv() + + +# ---------------- Dataset Schemas ---------------- +class DatasetPairDocument(BaseModel): + document_id: str + metadata: dict + content: str + + +class DatasetPair(BaseModel): + pair_id: str + query_id: str + query: str + document_a: DatasetPairDocument + document_b: DatasetPairDocument + + +class DatasetPairs(BaseModel): + pairs: list[DatasetPair] + + +# ---------------- JSON Loading Function ---------------- +def load_dataset_from_json(json_file_path: str) -> DatasetPairs: + with open(json_file_path, 'r', encoding='utf-8') as f: + data = json.load(f) + + pairs = [] + for pair_data in data['pairs']: + dataset_pair = DatasetPair( + pair_id=pair_data['pair_id'], + query_id=pair_data['query_id'], + query=pair_data['query'], + document_a=DatasetPairDocument( + document_id=pair_data['document_a']['document_id'], + metadata=pair_data['document_a']['metadata'], + content=pair_data['document_a']['content'] + ), + document_b=DatasetPairDocument( + document_id=pair_data['document_b']['document_id'], + metadata=pair_data['document_b']['metadata'], + content=pair_data['document_b']['content'] + ) + ) + pairs.append(dataset_pair) + + return DatasetPairs(pairs=pairs) + + +# ---------------- Prompt Builders ---------------- +def build_prompt_openai_groq(pair: DatasetPair, swap: bool) -> str: + doc_a_filename = pair.document_a.metadata.get('filename', 'Unknown') + doc_b_filename = pair.document_b.metadata.get('filename', 'Unknown') + + doc_a = f"Filename: {doc_a_filename}\nContent: {pair.document_a.content}" + doc_b = f"Filename: {doc_b_filename}\nContent: {pair.document_b.content}" + + if swap: + doc_a, doc_b = doc_b, doc_a + + return ( + "# Task\n" + "You are a job relevance scoring system. You will be given a job description and two candidate resumes " + "(A and B). 
Your job is to decide which resume is more relevant to the job description.\n\n"
+        "# Instructions\n"
+        "- Carefully read the job description and both resumes.\n"
+        "- Consider skills, experience, and alignment with the job requirements.\n"
+        "- If Resume A is more relevant, return a negative score.\n"
+        "- If Resume B is more relevant, return a positive score.\n"
+        "- If both are equally relevant or equally irrelevant, return 0.\n"
+        "- Always explain your reasoning first, then state the final numeric score.\n\n"
+        "# Scoring\n"
+        "Return a single numeric score from -1.0 to +1.0 (inclusive):\n"
+        "-1.0 → Resume A is far more relevant\n"
+        " 0.0 → Both resumes are equally relevant or irrelevant\n"
+        "+1.0 → Resume B is far more relevant\n\n"
+        "# Job Description:\n"
+        f"{pair.query}\n\n"
+        "# Resume A:\n"
+        f"{doc_a}\n\n"
+        "# Resume B:\n"
+        f"{doc_b}\n"
+    )
+
+
+def build_prompt_anthropic(pair: DatasetPair, swap: bool) -> str:
+    doc_a_filename = pair.document_a.metadata.get('filename', 'Unknown')
+    doc_b_filename = pair.document_b.metadata.get('filename', 'Unknown')
+
+    doc_a = f"Filename: {doc_a_filename}\nContent: {pair.document_a.content}"
+    doc_b = f"Filename: {doc_b_filename}\nContent: {pair.document_b.content}"
+
+    if swap:
+        doc_a, doc_b = doc_b, doc_a
+
+    return (
+        "# Task\n"
+        "You are a relevance scoring system. Given a query and two documents (A and B), "
+        "decide which one is more relevant to the query. Think carefully first, then conclude.\n\n"
+        "# Scoring\n"
+        "Score from -1.0 to 1.0 (negative → A is more relevant; positive → B is more relevant).\n\n"
+        "# Output Format\n"
+        "At the very end, your last line should be written in this format:\n"
+        "<score>\n"
+        "{your_score:.2f}\n"
+        "</score>\n\n"
+        f"# Query:\n\n{pair.query}\n\n# Document A:\n\n{doc_a}\n\n# Document B:\n\n{doc_b}\n"
+    )
+
+
+# ---------------- Helpers ----------------
+def summarize_tokens(results: List[Dict[str, int]], label: str):
+    def pctl(values, p):
+        # statistics.quantiles needs at least two data points; degrade gracefully.
+        if len(values) < 2:
+            return values[0] if values else 0
+        return statistics.quantiles(values, n=100)[p - 1]
+
+    input_tokens = [r["input"] for r in results]
+    output_tokens = [r["output"] for r in results]
+    total_tokens = [r["total"] for r in results]
+
+    print(f"\n--- {label} ---")
+    for name, values in [("Input", input_tokens), ("Output", output_tokens), ("Total", total_tokens)]:
+        if values:
+            print(f"{name} tokens: avg={statistics.mean(values):.2f}, p90={pctl(values,90):.2f}, p99={pctl(values,99):.2f}")
+        else:
+            print(f"{name} tokens: no data")
+
+
+# ---------------- Avg Token Functions ----------------
+async def avg_tokens_openai(dataset_pairs: DatasetPairs, model: str = "gpt-4.1-2025-04-14"):
+    client = openai.AsyncClient()
+    random.seed("score")  # fixed seed so the A/B swaps match across providers
+    results = []
+
+    for pair in dataset_pairs.pairs:
+        swap = random.random() < 0.5
+        prompt = build_prompt_openai_groq(pair, swap)
+        resp = await client.chat.completions.create(
+            model=model,
+            messages=[{"role": "user", "content": prompt}],
+        )
+        results.append({
+            "input": resp.usage.prompt_tokens,
+            "output": resp.usage.completion_tokens,
+            "total": resp.usage.total_tokens,
+        })
+    summarize_tokens(results, "OpenAI")
+
+
+async def avg_tokens_groq(dataset_pairs: DatasetPairs, model: str = "meta-llama/llama-4-scout-17b-16e-instruct"):
+    client = AsyncGroq()
+    random.seed("score")
+    results = []
+
+    for pair in dataset_pairs.pairs:
+        swap = random.random() < 0.5
+        prompt = build_prompt_openai_groq(pair, swap)
+        resp = await client.chat.completions.create(
+            model=model,
+            messages=[{"role": "user", "content": prompt}],
+        )
+        
results.append({ + "input": resp.usage.prompt_tokens, + "output": resp.usage.completion_tokens, + "total": resp.usage.total_tokens, + }) + summarize_tokens(results, "Groq") + + +async def avg_tokens_anthropic(dataset_pairs: DatasetPairs, model: str = "claude-sonnet-4-20250514"): + client = anthropic.AsyncAnthropic() + random.seed("score") + results = [] + + for pair in dataset_pairs.pairs: + swap = random.random() < 0.5 + prompt = build_prompt_anthropic(pair, swap) + resp = await client.messages.create( + model=model, + max_tokens=1024, + messages=[{"role": "user", "content": prompt}], + ) + results.append({ + "input": resp.usage.input_tokens, + "output": resp.usage.output_tokens, + "total": resp.usage.input_tokens + resp.usage.output_tokens, + }) + summarize_tokens(results, "Anthropic") + + +# ---------------- Example ---------------- +async def main(): + json_file_path = "/mnt/c/Users/polasani rohit/OneDrive/Desktop/zbench/data_new_v4/annotation/v4_llm_resume_data/pairs.json" + + dataset_pairs = load_dataset_from_json(json_file_path) + print(f"Loaded {len(dataset_pairs.pairs)} pairs from JSON file") + + # Pick 10 random samples + sampled_pairs = random.sample(dataset_pairs.pairs, k=min(10, len(dataset_pairs.pairs))) + dataset_pairs = DatasetPairs(pairs=sampled_pairs) + print(f"Using {len(dataset_pairs.pairs)} random samples\n") + + await avg_tokens_openai(dataset_pairs) + await avg_tokens_groq(dataset_pairs) + await avg_tokens_anthropic(dataset_pairs) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/zbench/ai.py b/zbench/ai.py index c265376..e4a85f2 100644 --- a/zbench/ai.py +++ b/zbench/ai.py @@ -32,46 +32,43 @@ class AIModel(BaseModel): def ratelimit_tpm(self) -> float: match self.company: case "openai": - # Tier 5 match self.model: case _ if self.model.startswith("gpt-4o-mini"): - return 150_000_000 + return 10_000_000 # From API headers case _ if self.model.startswith("gpt-4o"): - return 30_000_000 + return 5_000_000 # Adjust as per your tier if needed case "gpt-4-turbo": - return 2_000_000 + return 4_000_000 case _: return 1_000_000 case "google": # Tier 2 - return 5_000_000 + return 2_000_000 case "anthropic": - # Tier 4 - return 80_000 + return 40_000 @computed_field @property def ratelimit_rpm(self) -> float: match self.company: case "openai": - # Tier 5 match self.model: case _ if self.model.startswith("gpt-4o-mini"): - return 30_000 + return 10_000 # From API headers case _: - return 10_000 + return 5_000 # Adjust if API shows higher case "google": # Tier 2 - return 1_000 + return 150 case "anthropic": - # Tier 4 return 4_000 class AIMessage(BaseModel): role: Literal["system", "user", "assistant"] content: str -RATE_LIMIT_RATIO = 0.95 +RATE_LIMIT_RATIO = 1.0 # Use full quota + class AIConnection: openai_client: AsyncOpenAI @@ -123,7 +120,7 @@ async def ai_wait_ratelimit( ) self.backoff_semaphores[key] = asyncio.Semaphore(1) # Prevent too many redis connections. 
- self.redis_semaphores[key] = asyncio.Semaphore(100) + self.redis_semaphores[key] = asyncio.Semaphore(20) if backoff is not None: async with self.backoff_semaphores[key]: await asyncio.sleep(backoff) @@ -223,7 +220,7 @@ async def ai_call[T: str | BaseModel]( await get_ai_connection().ai_wait_ratelimit( model, num_tokens_input, backoff_algo(i - 1) if i > 0 else None ) - + def ai_message_to_openai_message_param( message: AIMessage, ) -> ChatCompletionMessageParam: @@ -239,6 +236,7 @@ def ai_message_to_openai_message_param( "openai": get_ai_connection().openai_client, "google": get_ai_connection().google_client, }[model.company] + if client is None: raise AIValueError(f"{model.company!r} client not configured") if response_format is str: @@ -251,6 +249,7 @@ def ai_message_to_openai_message_param( temperature=temperature, max_tokens=max_tokens, ) + logger.info("TRUESSS") response_content = response.choices[0].message.content assert response_content is not None assert isinstance(response_content, response_format) diff --git a/zbench/annotation.py b/zbench/annotation.py index 7e9c4ef..3f8d84e 100644 --- a/zbench/annotation.py +++ b/zbench/annotation.py @@ -17,7 +17,7 @@ class EnsembleZELOAnnotator: def __init__(self, dataset_path: str, annotated_dataset_path: str, *, cycle_num: int = 4, document_limit: int | None = None): self.initial_path = Path(dataset_path) self.dataset_name = self.initial_path.stem - self.working_dir = Path(f"data/annotation/{self.dataset_name}") + self.working_dir = Path(f"data_new_v5/annotation/{self.dataset_name}") self.initial_dir = Path(self.initial_path).parent self.cycle_num = cycle_num self.document_limit = document_limit @@ -91,7 +91,7 @@ async def step3_score_pairs(self) -> None: score = Score(pairs_path=str(self.pairs_path), scores_path=str(self.ai_scores_path)) pairs = score.load_pairs() - scores = await score.score_pairs(pairs) + scores = await score.score_pairs_batch(pairs) score.save_scores(scores) print(f"Scored {len(scores.scored_pairs)} pairs and saved to {self.ai_scores_path}") @@ -123,7 +123,7 @@ def step4_compose_annotated_dataset(self) -> None: score += 1/3 if scored_pair.anthropic_score.score > 0: score += 1/3 - if scored_pair.gemini_score.score > 0: + if scored_pair.groq_score.score > 0: score += 1/3 w[j,i] += score w[i,j] += 1 - score diff --git a/zbench/batch_score.py b/zbench/batch_score.py new file mode 100644 index 0000000..710c0c9 --- /dev/null +++ b/zbench/batch_score.py @@ -0,0 +1,664 @@ +import asyncio +from dotenv import load_dotenv +from pydantic import BaseModel +import random +import os +from tqdm.asyncio import tqdm_asyncio +import sys +import json +import tempfile +import asyncio +from groq import AsyncGroq +from openai import AsyncOpenAI +from google import genai +from google.genai import types +from anthropic import AsyncAnthropic +from typing import List, Dict +import re +from zbench.utils import ROOT, wrap_sem +from zbench.ai import ai_call, AIModel, AIMessage, AIError +from zbench.common_types import DatasetPairDocument, DatasetPair, DatasetPairs, DatasetPairScore, DatasetPairScoredPair, DatasetPairScoredPairs +from anthropic.types.message_create_params import MessageCreateParamsNonStreaming +from anthropic.types.messages.batch_create_params import Request + +load_dotenv() + +class Score: + def __init__(self, pairs_path:str, scores_path:str)->None: + self.pairs_path = pairs_path + self.scores_path = scores_path + self.sem = asyncio.Semaphore(150) + random.seed("score") + + def load_pairs(self)->DatasetPairs: + with 
+        with open(self.pairs_path, "r") as f:
+            pairs = DatasetPairs.model_validate_json(f.read())
+        return pairs
+
+    async def score_pair_structured(self, query: str, document_a: str, document_b: str, *, model: AIModel) -> DatasetPairScore:
+        swap = random.random() < 0.5
+        if swap:
+            document_a, document_b = document_b, document_a
+
+        class RelevanceScore(BaseModel):
+            thoughts: list[str]
+            score: float
+
+        try:
+            response = await ai_call(
+                model=model,
+                messages=[
+                    AIMessage(
+                        role="system",
+                        content=f"""
+# Task
+
+You are a relevance scoring system. Given a query and two documents (A and B), your job is to decide which document is more relevant to the given query. You should think carefully, considering the pros and cons between each document. For your first few sentences, consider the pros and cons of Document A. Then, spend some time thinking about Document B. Then, at the end, compare, and make a decision as to which one is more relevant. Do NOT make a decision in the beginning of your thoughts, stay open-minded until the last 1-2 sentences of your thoughts.
+
+# Scoring
+
+The score should range from -1.0 to 1.0, where negative means Document A is more relevant, and positive means Document B is more relevant.
+You can pick any number from -1.0 to 1.0.
+                        """,
+                    ),
+                    AIMessage(
+                        role="user",
+                        content=f"# Query:\n\n{query}\n\n# Document A:\n\n{document_a}\n\n# Document B:\n\n{document_b}\n\n",
+                    )
+                ],
+                temperature=0,
+                response_format=RelevanceScore,
+            )
+        except AIError as e:
+            print("Unknown Exception!", e, file=sys.stderr)
+            return DatasetPairScore(
+                thought="",
+                score=0.0,
+            )
+        thought = "\n".join(response.thoughts)
+        score = response.score
+        if swap:
+            thought = f"(SWAPPED)\n{thought}"
+            score = -score
+
+        return DatasetPairScore(
+            thought=thought,
+            score=score,
+        )
+
+    async def score_pair_unstructured(self, query: str, document_a: str, document_b: str, *, model: AIModel) -> DatasetPairScore:
+        swap = random.random() < 0.5
+        if swap:
+            document_a, document_b = document_b, document_a
+
+        prev_thought = ""
+        thought = ""
+        score = 0.0
+        try:
+            messages = [
+                AIMessage(
+                    role="system",
+                    content=f"""
+# Task
+
+You are a relevance scoring system. Given a query and two documents (A and B), your job is to decide which document is more relevant to the given query. You should think carefully, considering the pros and cons between each document. For your first few sentences, consider the pros and cons of Document A. Then, spend some time thinking about Document B. Then, at the end, compare, and make a decision as to which one is more relevant. Do NOT make a decision in the beginning of your thoughts, stay open-minded until the last 1-2 sentences of your thoughts. And, for the last 1-2 sentences, make a clear decision as to which document is more relevant. Ensure that by the last sentence of your thoughts you've made a clear determination as to which document is more relevant, and also how strong that opinion is (e.g. slightly more relevant versus significantly more relevant).
+
+# Scoring
+
+The score should range from -1.0 to 1.0, where negative means Document A is more relevant, and positive means Document B is more relevant.
+You can pick any number from -1.0 to 1.0.
+
+# Output Format
+
+At the very end, your last line should be written in this format:
+
+<score>
+{{your_score:.2f}}
+</score>
+
+Of course, replacing your_score with a float between -1.0 and 1.0.
+Do NOT output a score of 0.0, ensure to focus on which document is superior, and provide a negative or positive float between -1.0 and 1.0.
+                    """,
+                ),
+                AIMessage(
+                    role="user",
+                    content=f"# Query:\n\n{query}\n\n# Document A:\n\n{document_a}\n\n# Document B:\n\n{document_b}\n\n",
+                )
+            ]
+            for retry in range(2):
+                response = await ai_call(
+                    model=model,
+                    messages=messages,
+                    temperature=0,
+                )
+                messages.append(AIMessage(role="assistant", content=response))
+
+                re_result = re.search(r'<score>\s*([-+]?\d*\.\d+|\d+)\s*</score>', response)
+                if re_result:
+                    score = float(re_result.group(1))
+                    thought = response.rsplit('<score>', 1)[0].strip()
+                else:
+                    score = 0.0
+                    thought = response
+                if score == 0.0 and retry == 0:
+                    prev_thought = thought
+                    messages.append(AIMessage(role="user", content="You responded with a Score of 0.0. Please do NOT do this. You MUST output a score that is either negative, OR positive, but NOT 0.0. Please deeply consider whether or not Document A or Document B is preferable. If, after thinking deeply, you still aren't sure, then just make your best guess."))
+                    continue
+                break
+        except AIError as e:
+            print("Unknown Exception!", e, file=sys.stderr)
+            return DatasetPairScore(
+                thought="",
+                score=0.0,
+            )
+
+        if prev_thought != "":
+            thought = f"Round 1: {prev_thought}\n\nRound 2: {thought}"
+
+        if swap:
+            thought = f"(SWAPPED)\n{thought}"
+            score = -score
+
+        return DatasetPairScore(
+            thought=thought,
+            score=score,
+        )
+
+    async def score_pair_ensemble(self, pair: DatasetPair) -> DatasetPairScoredPair:
+        def format_document(document: DatasetPairDocument) -> str:
+            return f"Metadata: {document.metadata}\nContent: {document.content}"
+        query = pair.query
+        document_a = format_document(pair.document_a)
+        document_b = format_document(pair.document_b)
+        openai_score, gemini_score, anthropic_score = await asyncio.gather(
+            self.score_pair_structured(
+                query,
+                document_a,
+                document_b,
+                model=AIModel(company="openai", model="gpt-4.1-2025-04-14"),
+            ),
+            self.score_pair_structured(
+                query,
+                document_a,
+                document_b,
+                model=AIModel(company="google", model="gemini-2.5-pro-preview-03-25"),
+            ),
+            self.score_pair_unstructured(
+                query,
+                document_a,
+                document_b,
+                model=AIModel(company="anthropic", model="claude-3-7-sonnet-20250219"),
+            )
+        )
+        return DatasetPairScoredPair(
+            pair=pair,
+            openai_score=openai_score,
+            # Field renamed to groq_score in common_types; this non-batch
+            # path still produces the middle score with Gemini.
+            groq_score=gemini_score,
+            anthropic_score=anthropic_score,
+        )
+
+    @staticmethod
+    def build_openai_batch(dataset_pairs: DatasetPairs, model: str = "gpt-4.1-2025-04-14") -> List[str]:
+        """Build OpenAI batch JSONL lines from DatasetPairs."""
+        batch_lines = []
+
+        for idx, pair in enumerate(dataset_pairs.pairs):
+            swap = random.random() < 0.5
+            doc_a = f"Metadata: {pair.document_a.metadata}\nContent: {pair.document_a.content}"
+            doc_b = f"Metadata: {pair.document_b.metadata}\nContent: {pair.document_b.content}"
+            if swap:
+                doc_a, doc_b = doc_b, doc_a
+
+            system_prompt = """
+            # Task
+            You are a relevance scoring system. Given a query and two documents (A and B),
+            decide which document is more relevant to the query.
+            Think carefully about both, then conclude clearly at the end.
+
+            # Scoring
+            The score should range from -1.0 to 1.0,
+            where negative means document A is more relevant,
+            positive means document B is more relevant.
+            """
+
+            user_prompt = f"# Query:\n\n{pair.query}\n\n# Document A:\n\n{doc_a}\n\n# Document B:\n\n{doc_b}\n\n"
+
+            batch_entry = {
+                "custom_id": f"{pair.pair_id}-{idx}",
+                "method": "POST",
+                "url": "/v1/chat/completions",
+                "body": {
+                    "model": model,
+                    "messages": [
+                        {"role": "system", "content": system_prompt.strip()},
+                        {"role": "user", "content": user_prompt.strip()},
+                    ],
+                    "temperature": 0.0,
+                    "max_tokens": 2048
+                }
+            }
+            batch_lines.append(json.dumps(batch_entry))
+
+        return batch_lines
+
+    @staticmethod
+    def build_gemini_batch_jsonl(dataset_pairs: DatasetPairs, model: str = "models/gemini-2.5-pro-preview-03-25") -> List[str]:
+        batch_lines: List[str] = []
+
+        for idx, pair in enumerate(dataset_pairs.pairs):
+            swap = random.random() < 0.5
+            doc_a = f"Metadata: {pair.document_a.metadata}\nContent: {pair.document_a.content}"
+            doc_b = f"Metadata: {pair.document_b.metadata}\nContent: {pair.document_b.content}"
+
+            if swap:
+                doc_a, doc_b = doc_b, doc_a
+
+            prompt = (
+                "# Task\n"
+                "You are a relevance scoring system. Given a query and two documents (A and B), "
+                "decide which one is more relevant by carefully comparing them, and conclude at the end.\n\n"
+                "# Scoring\n"
+                "Score from -1.0 to 1.0 (negative → A is more relevant; positive → B is more relevant).\n\n"
+                f"# Query:\n\n{pair.query}\n\n# Document A:\n\n{doc_a}\n\n# Document B:\n\n{doc_b}\n"
+            )
+
+            entry = {
+                "key": f"{pair.pair_id}-{idx}",
+                "request": {
+                    "model": model,
+                    "contents": [
+                        {"parts": [{"text": prompt}]}
+                    ]
+                }
+            }
+
+            batch_lines.append(json.dumps(entry))
+
+        return batch_lines
+
+    @staticmethod
+    def build_groq_batch(dataset_pairs: DatasetPairs, model: str = "meta-llama/llama-4-scout-17b-16e-instruct") -> List[dict]:
+        batch_lines: List[dict] = []
+
+        for idx, pair in enumerate(dataset_pairs.pairs):
+            swap = random.random() < 0.5
+            doc_a = f"Metadata: {pair.document_a.metadata}\nContent: {pair.document_a.content}"
+            doc_b = f"Metadata: {pair.document_b.metadata}\nContent: {pair.document_b.content}"
+            if swap:
+                doc_a, doc_b = doc_b, doc_a
+
+            prompt = (
+                "# Task\n"
+                "You are a relevance scoring system. Given a query and two documents (A and B), "
+                "decide which one is more relevant to the query. Think carefully first, then conclude.\n\n"
+                "# Scoring\n"
+                "Score from -1.0 to 1.0 (negative → A is more relevant; positive → B is more relevant).\n\n"
+                f"# Query:\n\n{pair.query}\n\n# Document A:\n\n{doc_a}\n\n# Document B:\n\n{doc_b}\n"
+            )
+
+            batch_lines.append({
+                "custom_id": f"{pair.pair_id}-{idx}",
+                "method": "POST",
+                "url": "/v1/chat/completions",
+                "body": {
+                    "model": model,
+                    "messages": [
+                        {
+                            "role": "user",
+                            "content": prompt
+                        }
+                    ]
+                }
+            })
+
+        return batch_lines
+
+    @staticmethod
+    def build_anthropic_batch_request(
+        dataset_pairs: DatasetPairs,
+        model: str = "claude-3-7-sonnet-20250219",
+        max_tokens: int = 1024
+    ) -> list[Request]:
+        """Build Anthropic batch requests from DatasetPairs."""
+        requests: list[Request] = []
+
+        for idx, pair in enumerate(dataset_pairs.pairs):
+            swap = random.random() < 0.5
+            doc_a = f"Metadata: {pair.document_a.metadata}\nContent: {pair.document_a.content}"
+            doc_b = f"Metadata: {pair.document_b.metadata}\nContent: {pair.document_b.content}"
+            if swap:
+                doc_a, doc_b = doc_b, doc_a
+
+            prompt = (
+                "# Task\n"
+                "You are a relevance scoring system. Given a query and two documents (A and B), "
+                "decide which one is more relevant to the query. 
Think carefully first, then conclude.\n\n" + "# Scoring\n" + "Score from -1.0 to 1.0 (negative → A is more relevant; positive → B is more relevant).\n\n" + f"# Query:\n\n{pair.query}\n\n# Document A:\n\n{doc_a}\n\n# Document B:\n\n{doc_b}\n" + ) + + requests.append( + Request( + custom_id=f"{pair.pair_id}-{idx}", + params=MessageCreateParamsNonStreaming( + model=model, + max_tokens=max_tokens, + messages=[{"role": "user", "content": prompt}] + ) + ) + ) + + return requests + + + async def submit_openai_batch(self, batch_lines: list[str]) -> list[dict]: + """Submit OpenAI batch and poll until complete.""" + client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) + + # Write JSONL file for batch input + with tempfile.NamedTemporaryFile("w+", suffix=".jsonl", delete=False) as f: + f.write("\n".join(batch_lines)) + f.flush() + input_file = await client.files.create(file=open(f.name, "rb"), purpose="batch") + + # Create batch job + batch = await client.batches.create( + input_file_id=input_file.id, + endpoint="/v1/chat/completions", + completion_window="24h" + ) + + print(f"[OpenAI Batch] Created job: {batch.id}, status={batch.status}") + + # Poll until complete + while True: + b = await client.batches.retrieve(batch.id) + print(f"[OpenAI Batch] Status: {b.status}") # 👈 live progress log + + if b.status == "completed": + print(f"[OpenAI Batch] Completed! Downloading results...") + output_file = await client.files.content(b.output_file_id) + # Depending on SDK, may be .text or .read().decode("utf-8") + content = getattr(output_file, "text", None) + if content is None: # fallback + content = (await output_file.read()).decode("utf-8") + return [json.loads(line) for line in content.splitlines()] + + elif b.status in ["failed", "expired", "canceled"]: + print(f"[OpenAI Batch] Job ended with status={b.status}") + return [] + + await asyncio.sleep(10) # sleep between polls + + + + + async def submit_groq_batch(self, batch_lines: list[dict]) -> list[dict]: + """Submit Groq batch and return parsed results (using temp JSONL).""" + + client = AsyncGroq(api_key=os.getenv("GROQ_API_KEY")) + + # --- Write input batch to a temp JSONL file --- + with tempfile.NamedTemporaryFile("w+", suffix=".jsonl", delete=False) as f: + for item in batch_lines: + f.write(json.dumps(item) + "\n") + f.flush() + input_path = f.name + + uploaded = await client.files.create(file=open(input_path, "rb"), purpose="batch") + + # --- Create batch job --- + batch = await client.batches.create( + input_file_id=uploaded.id, + endpoint="/v1/chat/completions", + completion_window="24h", + ) + print(f"[Groq Batch] Created job {batch.id}, status={batch.status}") + + # --- Poll until finished --- + while True: + status = await client.batches.retrieve(batch.id) + print(f"[Groq Batch] Status: {status.status}") + if status.status in ["completed", "failed", "cancelled"]: + break + await asyncio.sleep(10) + + if status.status != "completed": + raise RuntimeError(f"❌ Groq batch failed with status: {status.status}") + + # --- Fetch results file (BinaryAPIResponse) --- + output_file = await client.files.content(status.output_file_id) + + # Save to a temp JSONL and re-read (like official example) + with tempfile.NamedTemporaryFile("wb+", suffix=".jsonl", delete=False) as tmp: + await output_file.write_to_file(tmp.name) # SDK helper + tmp.flush() + output_path = tmp.name + + # Read JSONL into memory + with open(output_path, "r", encoding="utf-8") as f: + results = [json.loads(line) for line in f if line.strip()] + + print(f"[Groq Batch] ✅ Retrieved 
{len(results)} results") + + # Cleanup temp files + try: + os.remove(input_path) + os.remove(output_path) + except Exception as e: + print(f"⚠️ Temp cleanup failed: {e}") + + return results + + + async def submit_gemini_batch(self, batch_lines: list[str], model="models/gemini-2.5-pro-preview-03-25") -> list[dict]: + """Submit Gemini batch JSONL lines and poll until done.""" + + # Write to temp file + with tempfile.NamedTemporaryFile("w+", suffix=".jsonl", delete=False) as f: + f.write("\n".join(batch_lines)) + f.flush() + path = f.name + + client = genai.Client() + uploaded = client.files.upload(file=path,config=types.UploadFileConfig(display_name='my-batch-requests', mime_type='jsonl') +) + batch_job = client.batches.create(model=model, src=uploaded.name, config={'display_name': 'zbench_batch'}) + + # Poll until done + while True: + job = client.batches.get(name=batch_job.name) + if job.state.name == 'JOB_STATE_SUCCEEDED': + output = client.files.download(file=job.dest.file_name).decode('utf-8') + return [json.loads(line) for line in output.splitlines()] + elif job.state.name in ('JOB_STATE_FAILED', 'JOB_STATE_CANCELLED'): + raise RuntimeError(f"Gemini batch job error: {job.state.name}") + await asyncio.sleep(15) + + + async def submit_anthropic_batch(self, requests: list[Request]) -> list[dict]: + """Submit Anthropic batch and poll for results.""" + + client = AsyncAnthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) + + batch = await client.messages.batches.create(requests=requests) + print(f"[Anthropic Batch] Created job: {batch.id}, status={batch.processing_status}") + + while True: + b = await client.messages.batches.retrieve(batch.id) + print(f"[Anthropic Batch] Status: {b.processing_status}") + + if b.processing_status == "ended": + print("[Anthropic Batch] Completed! 
Downloading results...") + try: + # Use the built-in results method to stream results + results = [] + r = await client.messages.batches.results(batch.id) + async for result in r: + # Convert to the format expected by your code + result_dict = { + "custom_id": result.custom_id, + "result": { + "type": result.result.type, + } + } + + if result.result.type == "succeeded": + result_dict["result"]["message"] = { + "content": [{"text": result.result.message.content[0].text}] + } + elif result.result.type == "errored": + result_dict["result"]["error"] = { + "type": result.result.error.type, + "message": getattr(result.result.error, 'message', str(result.result.error)) + } + + results.append(result_dict) + + return results + + except Exception as e: + print(f"[Anthropic Batch] Error downloading results: {e}") + return [] + + elif b.processing_status in ["failed", "expired", "canceled"]: + print(f"[Anthropic Batch] Job ended with status={b.processing_status}") + return [] + + await asyncio.sleep(10) # poll every 10s + + + async def score_pairs_batch(self, pairs: DatasetPairs) -> DatasetPairScoredPairs: + """Score pairs in batch using OpenAI, Gemini, and Anthropic.""" + if not pairs.pairs: + return DatasetPairScoredPairs(scored_pairs=[]) + + # Build batch payloads + openai_batch = self.build_openai_batch(pairs) + groq_batch = self.build_groq_batch(pairs) + anthropic_batch = self.build_anthropic_batch_request(pairs) + + # Submit batches concurrently + import time + + start_time = time.time() + print(f"Submitting batch pairs to OpenAI, Groq, and Anthropic at {start_time}...") + openai_results,groq_results,anthropic_results = await asyncio.gather( + self.submit_openai_batch(openai_batch), + self.submit_groq_batch(groq_batch), + self.submit_anthropic_batch(anthropic_batch), + ) + end_time = time.time() + print(f"Batch scoring completed in {end_time - start_time:.2f} seconds") + print(groq_results[0]) + groq_map = {r["custom_id"]: r for r in groq_results} + openai_map = {r["custom_id"]: r for r in openai_results} + + if anthropic_results == []: + anthropic_map = {} + else: + anthropic_map = {r["custom_id"]: r for r in anthropic_results} + + scored_pairs: list[DatasetPairScoredPair] = [] + + for idx, pair in enumerate(pairs.pairs): + cid = f"{pair.pair_id}-{idx}" + + o_raw = openai_map[cid]["response"]["body"]["choices"][0]["message"]["content"] + o_score_match = re.search(r'(-?\d+(\.\d+)?)', o_raw) + openai_score = DatasetPairScore( + thought=o_raw, + score=float(o_score_match.group(1)) if o_score_match else 0.0, + ) + # openai_score = DatasetPairScore( + # thought="(OpenAI skipped due to quota)", + # score=0.0, + # ) + + # --- Gemini --- + # g_raw = gemini_map[cid]["response"]["candidates"][0]["content"]["parts"][0]["text"] + # g_score_match = re.search(r'(-?\d+(\.\d+)?)', g_raw) + # gemini_score = DatasetPairScore( + # thought=g_raw, + # score=float(g_score_match.group(1)) if g_score_match else 0.0, + # ) + # gemini_score = DatasetPairScore( + # thought="(Gemini skipped due to quota)", + # score=0.0, + # ) + if cid in groq_map: + gq_raw = groq_map[cid]["response"]["body"]["choices"][0]["message"]["content"] + gq_score_match = re.search(r'(-?\d+(\.\d+)?)', gq_raw) + groq_score = DatasetPairScore( + thought=gq_raw, + score=float(gq_score_match.group(1)) if gq_score_match else 0.0, + ) + else: + groq_score = DatasetPairScore( + thought="(Groq returned no result)", + score=0.0, + ) + + if cid in anthropic_map: + anthropic_result = anthropic_map[cid] + + if anthropic_result["result"]["type"] == 
"succeeded": + a_raw = anthropic_result["result"]["message"]["content"][0]["text"] + + + a_score_match = re.search(r'\s*([-+]?\d*\.\d+|\d+)\s*', a_raw) + if not a_score_match: + # Fallback: any number that looks like a score + a_score_match = re.search(r'(-?\d+(\.\d+)?)', a_raw) + + anthropic_score = DatasetPairScore( + thought=a_raw, + score=float(a_score_match.group(1)) if a_score_match else 0.0, + ) + + else: + # Handle errored, canceled, or expired results + error_type = anthropic_result["result"]["type"] + error_msg = anthropic_result["result"].get("error", {}).get("message", "") + anthropic_score = DatasetPairScore( + thought=f"(Anthropic request {error_type}: {error_msg})", + score=0.0, + ) + else: + anthropic_score = DatasetPairScore( + thought="(Anthropic returned no result)", + score=0.0, + ) + + # anthropic_score = DatasetPairScore( + # thought="(Anthropic returned no result)", + # score=0.0, + # ) + scored_pairs.append( + DatasetPairScoredPair( + pair=pair, + openai_score=openai_score, + groq_score=groq_score, + anthropic_score=anthropic_score, + ) + ) + + return DatasetPairScoredPairs(scored_pairs=scored_pairs) + + + async def score_pairs(self, pairs: DatasetPairs) -> DatasetPairScoredPairs: + scored_pairs = await tqdm_asyncio.gather( + *[ + wrap_sem(self.score_pair_ensemble(pair), self.sem) + for pair in pairs.pairs + ], + desc="Scoring Pairs", + ) + return DatasetPairScoredPairs(scored_pairs=scored_pairs) + + def save_scores(self, scores: DatasetPairScoredPairs) -> None: + with open(self.scores_path, "w") as f: + f.write(scores.model_dump_json(indent=4)) \ No newline at end of file diff --git a/zbench/common_types.py b/zbench/common_types.py index 447b0b0..3380abb 100644 --- a/zbench/common_types.py +++ b/zbench/common_types.py @@ -66,7 +66,7 @@ class ModelScore(BaseModel): class DatasetPairScoredPair(BaseModel): pair: DatasetPair openai_score: DatasetPairScore - gemini_score: DatasetPairScore + groq_score: DatasetPairScore anthropic_score: DatasetPairScore diff --git a/zbench/score.py b/zbench/score.py index cab4707..992f2b5 100644 --- a/zbench/score.py +++ b/zbench/score.py @@ -2,12 +2,26 @@ from dotenv import load_dotenv from pydantic import BaseModel import random +import os from tqdm.asyncio import tqdm_asyncio import sys +import json +import tempfile +import asyncio +from groq import AsyncGroq +from openai import AsyncOpenAI +from google import genai +from google.genai import types +from anthropic import AsyncAnthropic +from typing import List, Dict import re +import tiktoken from zbench.utils import ROOT, wrap_sem from zbench.ai import ai_call, AIModel, AIMessage, AIError from zbench.common_types import DatasetPairDocument, DatasetPair, DatasetPairs, DatasetPairScore, DatasetPairScoredPair, DatasetPairScoredPairs +from anthropic.types.message_create_params import MessageCreateParamsNonStreaming +from anthropic.types.messages.batch_create_params import Request +from transformers import AutoTokenizer load_dotenv() @@ -184,6 +198,596 @@ def format_document(document: DatasetPairDocument) -> str: anthropic_score=anthropic_score, ) + + @staticmethod + def build_openai_batch(dataset_pairs: DatasetPairs, model: str = "gpt-4.1-2025-04-14") -> tuple[List[str], dict[str, bool], float]: + """Build OpenAI batch JSONL lines with swap tracking.""" + batch_lines = [] + swap_tracking = {} + + random.seed("score") + + for idx, pair in enumerate(dataset_pairs.pairs): + swap = random.random() < 0.5 + swap_tracking[f"{pair.pair_id}-{idx}"] = swap + + doc_a = f"Metadata: 
{pair.document_a.metadata}\nContent: {pair.document_a.content}" + doc_b = f"Metadata: {pair.document_b.metadata}\nContent: {pair.document_b.content}" + + if swap: + doc_a, doc_b = doc_b, doc_a + + system_prompt = """ + You are a relevance scoring system. You will be given a Query and two Documents (A and B). + +## Instructions: +1. **Evaluate Document A**: + - Note points that make it relevant to the Query. + - Note points that make it less relevant. +2. **Evaluate Document B**: + - Note points that make it relevant to the Query. + - Note points that make it less relevant. +3. **Compare and decide**: + - Only in the last 1–2 sentences choose which is more relevant overall. + - Stay open-minded until the comparison step. + +## Scoring: +- Output a single numeric score between **-1.0** and **1.0**: + - Negative → Document A is more relevant. + - Positive → Document B is more relevant. + - 0.0 → Equally relevant. +- The **magnitude** reflects confidence (closer to ±1.0 = more confident). + +## Output format: +Reasoning: [Your comparative analysis] +Score: [Your numeric score] + """ + + user_prompt = f"# Query:\n\n{pair.query}\n\n# Document A:\n\n{doc_a}\n\n# Document B:\n\n{doc_b}\n\n" + + batch_entry = { + "custom_id": f"{pair.pair_id}-{idx}", + "method": "POST", + "url": "/v1/chat/completions", + "body": { + "model": model, + "messages": [ + {"role": "system", "content": system_prompt.strip()}, + {"role": "user", "content": user_prompt.strip()}, + ], + "temperature": 0.0, + "max_tokens": 2048, + "response_format": { + "type": "json_schema", + "json_schema": { + "name": "RelevanceScore", + "schema": { + "type": "object", + "properties": { + "thoughts": { + "type": "array", + "items": {"type": "string"}, + "description": "List of thoughts analyzing each document" + }, + "score": { + "type": "number", + "minimum": -1.0, + "maximum": 1.0, + "description": "Relevance score from -1.0 to 1.0" + } + }, + "required": ["thoughts", "score"], + "additionalProperties": False + } + } + } + } + } + + + batch_lines.append(json.dumps(batch_entry)) + + return batch_lines, swap_tracking + + # Updated Groq batch method with response format + @staticmethod + def build_groq_batch( + dataset_pairs: DatasetPairs, + model: str = "meta-llama/llama-4-scout-17b-16e-instruct" + ) -> tuple[List[dict], dict[str, bool], float]: + batch_lines = [] + swap_tracking = {} + + random.seed("score") + + for idx, pair in enumerate(dataset_pairs.pairs): + swap = random.random() < 0.5 + swap_tracking[f"{pair.pair_id}-{idx}"] = swap + + doc_a = f"Metadata: {pair.document_a.metadata}\nContent: {pair.document_a.content}" + doc_b = f"Metadata: {pair.document_b.metadata}\nContent: {pair.document_b.content}" + + if swap: + doc_a, doc_b = doc_b, doc_a + + system_prompt = """You are a relevance scoring system. You will be given a Query and two Documents (A and B). + + ## Instructions: + 1. **Evaluate Document A**: + - Note points that make it relevant to the Query. + - Note points that make it less relevant. + 2. **Evaluate Document B**: + - Note points that make it relevant to the Query. + - Note points that make it less relevant. + 3. **Compare and decide**: + - Only in the last 1–2 sentences choose which is more relevant overall. + - Stay open-minded until the comparison step. + + ## Scoring: + - Output a single numeric score between **-1.0** and **1.0**: + - Negative → Document A is more relevant. + - Positive → Document B is more relevant. + - 0.0 → Equally relevant. 
+ - The **magnitude** reflects confidence (closer to ±1.0 = more confident). + + ## Output format: + Reasoning: [Your comparative analysis] + Score: [Your numeric score] + + """ + + user_prompt = f"# Query:\n\n{pair.query}\n\n# Document A:\n\n{doc_a}\n\n# Document B:\n\n{doc_b}\n\n" + + body = { + "model": model, + "messages": [{"role": "system", "content": system_prompt.strip()}, + {"role": "user", "content": user_prompt.strip()}], + "response_format": { + "type": "json_schema", + "json_schema": { + "name": "RelevanceScore", + "schema": { + "type": "object", + "properties": { + "thoughts": { + "type": "array", + "items": {"type": "string"} + }, + "score": { + "type": "number", + "minimum": -1.0, + "maximum": 1.0 + } + }, + "required": ["thoughts", "score"] + } + } + } + } + + batch_lines.append({ + "custom_id": f"{pair.pair_id}-{idx}", + "method": "POST", + "url": "/v1/chat/completions", + "body": body + }) + + + return batch_lines, swap_tracking + + + @staticmethod + def build_gemini_batch_jsonl(dataset_pairs: DatasetPairs, model: str = "models/gemini-2.5-pro-preview-03-25") -> List[str]: + batch_lines: List[str] = [] + + for idx, pair in enumerate(dataset_pairs.pairs): + swap = random.random() < 0.5 + doc_a = f"Metadata: {pair.document_a.metadata}\nContent: {pair.document_a.content}" + doc_b = f"Metadata: {pair.document_b.metadata}\nContent: {pair.document_b.content}" + + if swap: + doc_a, doc_b = doc_b, doc_a + + prompt = ( + "# Task\n" + "You are a job relevance scoring system. You will be given a job description and two candidate resumes " + "(A and B). Your job is to decide which resume is more relevant to the job description.\n\n" + "# Instructions\n" + "- Carefully read the job description and both resumes.\n" + "- Consider skills, experience, and alignment with the job requirements.\n" + "- If Resume A is more relevant, return a negative score.\n" + "- If Resume B is more relevant, return a positive score.\n" + "- If both are equally relevant or equally irrelevant, return 0.\n" + "- Always explain your reasoning first, then state the final numeric score.\n\n" + "# Scoring\n" + "Return a single numeric score **strictly between -1.0 and +1.0 (inclusive)**:\n" + "-1.0 → Resume A is far more relevant\n" + " 0.0 → Both resumes are equally relevant or irrelevant\n" + "+1.0 → Resume B is far more relevant\n\n" + "Intermediate values (e.g., -0.7, -0.3, 0.2, 0.6, 0.9) are encouraged " + "to capture nuanced differences in relevance.\n\n" + "# Job Description:\n" + f"{pair.query}\n\n" + "# Resume A:\n" + f"{doc_a}\n\n" + "# Resume B:\n" + f"{doc_b}\n" + ) + + + entry = { + "key": f"{pair.pair_id}-{idx}", + "request": { + "model": model, + "contents": [ + {"parts": [{"text": prompt}]} + ], + "generationConfig": { + "responseMimeType": "application/json", + "responseSchema": { + "type": "object", + "properties": { + "thoughts": { + "type": "array", + "items": {"type": "string"} + }, + "score": { + "type": "number", + "minimum": -1.0, + "maximum": 1.0 + } + }, + "required": ["thoughts", "score"] + } + } + } + } + + batch_lines.append(json.dumps(entry)) + + return batch_lines + + + + + + @staticmethod + def build_anthropic_batch_request( + dataset_pairs: DatasetPairs, + model: str = "claude-sonnet-4-20250514", + max_tokens: int = 1024 + ) -> tuple[list[Request], dict[str, bool]]: + """Build Anthropic batch requests with swap tracking.""" + requests = [] + swap_tracking = {} + + random.seed("score") + + for idx, pair in enumerate(dataset_pairs.pairs): + swap = random.random() < 0.5 + 
swap_tracking[f"{pair.pair_id}-{idx}"] = swap
+
+            doc_a = f"Metadata: {pair.document_a.metadata}\nContent: {pair.document_a.content}"
+            doc_b = f"Metadata: {pair.document_b.metadata}\nContent: {pair.document_b.content}"
+
+            if swap:
+                doc_a, doc_b = doc_b, doc_a
+
+            prompt = (
+                "# Task\n"
+                "You are a relevance scoring system. Given a query and two documents (A and B), "
+                "decide which one is more relevant to the query. Think carefully first, then conclude.\n\n"
+                "# Scoring\n"
+                "Score from -1.0 to 1.0 (negative → A is more relevant; positive → B is more relevant).\n\n"
+                "# Output Format\n"
+                "At the very end, your last line should be written in this format:\n"
+                "<score>\n"
+                "{your_score:.2f}\n"
+                "</score>\n\n"
+                f"# Query:\n\n{pair.query}\n\n# Document A:\n\n{doc_a}\n\n# Document B:\n\n{doc_b}\n"
+            )
+
+            requests.append(
+                Request(
+                    custom_id=f"{pair.pair_id}-{idx}",
+                    params=MessageCreateParamsNonStreaming(
+                        model=model,
+                        max_tokens=max_tokens,
+                        messages=[{"role": "user", "content": prompt}]
+                    )
+                )
+            )
+
+        return requests, swap_tracking
+
+    async def submit_openai_batch(self, batch_lines: list[str]) -> list[dict]:
+        """Submit OpenAI batch and poll until complete."""
+        client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+        # Write JSONL file for batch input
+        with tempfile.NamedTemporaryFile("w+", suffix=".jsonl", delete=False) as f:
+            f.write("\n".join(batch_lines))
+            f.flush()
+            input_file = await client.files.create(file=open(f.name, "rb"), purpose="batch")
+
+        # Create batch job
+        batch = await client.batches.create(
+            input_file_id=input_file.id,
+            endpoint="/v1/chat/completions",
+            completion_window="24h"
+        )
+
+        print(f"[OpenAI Batch] Created job: {batch.id}, status={batch.status}")
+
+        # Poll until complete
+        while True:
+            b = await client.batches.retrieve(batch.id)
+            print(f"[OpenAI Batch] Status: {b.status}")  # 👈 live progress log
+
+            if b.status == "completed":
+                print(f"[OpenAI Batch] Completed! 
Downloading results...") + output_file = await client.files.content(b.output_file_id) + # Depending on SDK, may be .text or .read().decode("utf-8") + content = getattr(output_file, "text", None) + if content is None: # fallback + content = (await output_file.read()).decode("utf-8") + return [json.loads(line) for line in content.splitlines()] + + elif b.status in ["failed", "expired", "canceled"]: + print(f"[OpenAI Batch] Job ended with status={b.status}") + return [] + + await asyncio.sleep(10) # sleep between polls + + + + + async def submit_groq_batch(self, batch_lines: list[dict]) -> list[dict]: + """Submit Groq batch and return parsed results (using temp JSONL).""" + + client = AsyncGroq(api_key=os.getenv("GROQ_API_KEY")) + + # --- Write input batch to a temp JSONL file --- + with tempfile.NamedTemporaryFile("w+", suffix=".jsonl", delete=False) as f: + for item in batch_lines: + f.write(json.dumps(item) + "\n") + f.flush() + input_path = f.name + + uploaded = await client.files.create(file=open(input_path, "rb"), purpose="batch") + + # --- Create batch job --- + batch = await client.batches.create( + input_file_id=uploaded.id, + endpoint="/v1/chat/completions", + completion_window="24h", + ) + print(f"[Groq Batch] Created job {batch.id}, status={batch.status}") + + # --- Poll until finished --- + while True: + status = await client.batches.retrieve(batch.id) + print(f"[Groq Batch] Status: {status.status}") + if status.status in ["completed", "failed", "cancelled"]: + break + await asyncio.sleep(10) + + if status.status != "completed": + raise RuntimeError(f"❌ Groq batch failed with status: {status.status}") + + # --- Fetch results file (BinaryAPIResponse) --- + output_file = await client.files.content(status.output_file_id) + + # Save to a temp JSONL and re-read (like official example) + with tempfile.NamedTemporaryFile("wb+", suffix=".jsonl", delete=False) as tmp: + await output_file.write_to_file(tmp.name) # SDK helper + tmp.flush() + output_path = tmp.name + + # Read JSONL into memory + with open(output_path, "r", encoding="utf-8") as f: + results = [json.loads(line) for line in f if line.strip()] + + print(f"[Groq Batch] ✅ Retrieved {len(results)} results") + + # Cleanup temp files + try: + os.remove(input_path) + os.remove(output_path) + except Exception as e: + print(f"⚠️ Temp cleanup failed: {e}") + + return results + + + async def submit_gemini_batch(self, batch_lines: list[str], model="models/gemini-2.5-pro-preview-03-25") -> list[dict]: + """Submit Gemini batch JSONL lines and poll until done.""" + + # Write to temp file + with tempfile.NamedTemporaryFile("w+", suffix=".jsonl", delete=False) as f: + f.write("\n".join(batch_lines)) + f.flush() + path = f.name + + client = genai.Client() + uploaded = client.files.upload(file=path,config=types.UploadFileConfig(display_name='my-batch-requests', mime_type='jsonl') +) + batch_job = client.batches.create(model=model, src=uploaded.name, config={'display_name': 'zbench_batch'}) + + # Poll until done + while True: + job = client.batches.get(name=batch_job.name) + if job.state.name == 'JOB_STATE_SUCCEEDED': + output = client.files.download(file=job.dest.file_name).decode('utf-8') + return [json.loads(line) for line in output.splitlines()] + elif job.state.name in ('JOB_STATE_FAILED', 'JOB_STATE_CANCELLED'): + raise RuntimeError(f"Gemini batch job error: {job.state.name}") + await asyncio.sleep(15) + + + async def submit_anthropic_batch(self, requests: list[Request]) -> list[dict]: + """Submit Anthropic batch and poll for results.""" + + 
client = AsyncAnthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) + + batch = await client.messages.batches.create(requests=requests) + print(f"[Anthropic Batch] Created job: {batch.id}, status={batch.processing_status}") + + while True: + b = await client.messages.batches.retrieve(batch.id) + print(f"[Anthropic Batch] Status: {b.processing_status}") + + if b.processing_status == "ended": + print("[Anthropic Batch] Completed! Downloading results...") + try: + # Use the built-in results method to stream results + results = [] + r = await client.messages.batches.results(batch.id) + async for result in r: + # Convert to the format expected by your code + result_dict = { + "custom_id": result.custom_id, + "result": { + "type": result.result.type, + } + } + + if result.result.type == "succeeded": + result_dict["result"]["message"] = { + "content": [{"text": result.result.message.content[0].text}] + } + elif result.result.type == "errored": + result_dict["result"]["error"] = { + "type": result.result.error.type, + "message": getattr(result.result.error, 'message', str(result.result.error)) + } + + results.append(result_dict) + + return results + + except Exception as e: + print(f"[Anthropic Batch] Error downloading results: {e}") + return [] + + elif b.processing_status in ["failed", "expired", "canceled"]: + print(f"[Anthropic Batch] Job ended with status={b.processing_status}") + return [] + + await asyncio.sleep(10) # poll every 10s + + + async def score_pairs_batch(self, pairs: DatasetPairs) -> DatasetPairScoredPairs: + """Score pairs in batch using OpenAI, Groq, and Anthropic with proper swap handling.""" + if not pairs.pairs: + return DatasetPairScoredPairs(scored_pairs=[]) + + random.seed("score") + # Build batch payloads with swap tracking + openai_batch, openai_swaps = self.build_openai_batch(pairs) + groq_batch, groq_swaps = self.build_groq_batch(pairs) + anthropic_batch, anthropic_swaps = self.build_anthropic_batch_request(pairs) + + # Submit batches concurrently + import time + + start_time = time.time() + print(f"Submitting batch pairs to OpenAI, Groq, and Anthropic at {start_time}...") + openai_results, groq_results, anthropic_results = await asyncio.gather( + self.submit_openai_batch(openai_batch), + self.submit_groq_batch(groq_batch), + self.submit_anthropic_batch(anthropic_batch), + ) + end_time = time.time() + print(f"Batch scoring completed in {end_time - start_time:.2f} seconds") + + groq_map = {r["custom_id"]: r for r in groq_results} + openai_map = {r["custom_id"]: r for r in openai_results} + anthropic_map = {r["custom_id"]: r for r in anthropic_results} if anthropic_results else {} + + scored_pairs: list[DatasetPairScoredPair] = [] + + for idx, pair in enumerate(pairs.pairs): + cid = f"{pair.pair_id}-{idx}" + + # --- OpenAI (structured response with swap correction) --- + try: + o_raw = openai_map[cid]["response"]["body"]["choices"][0]["message"]["content"] + o_data = json.loads(o_raw) + thought = "\n".join(o_data["thoughts"]) + score = float(o_data["score"]) + + # Apply swap correction + if openai_swaps.get(cid, False): + thought = f"(SWAPPED)\n{thought}" + score = -score + + openai_score = DatasetPairScore(thought=thought, score=score) + except (KeyError, json.JSONDecodeError, ValueError) as e: + print(f"Error parsing OpenAI result for {cid}: {e}") + openai_score = DatasetPairScore(thought="(OpenAI parsing error)", score=0.0) + + # --- Groq (structured response with swap correction) --- + if cid in groq_map: + try: + gq_raw = 
groq_map[cid]["response"]["body"]["choices"][0]["message"]["content"]
+                    gq_data = json.loads(gq_raw)
+                    thought = "\n".join(gq_data["thoughts"])
+                    score = float(gq_data["score"])
+
+                    # Apply swap correction
+                    if groq_swaps.get(cid, False):
+                        thought = f"(SWAPPED)\n{thought}"
+                        score = -score
+
+                    groq_score = DatasetPairScore(thought=thought, score=score)
+                except (KeyError, json.JSONDecodeError, ValueError) as e:
+                    print(f"Error parsing Groq result for {cid}: {e}")
+                    groq_score = DatasetPairScore(thought="(Groq parsing error)", score=0.0)
+            else:
+                groq_score = DatasetPairScore(thought="(Groq returned no result)", score=0.0)
+
+            # --- Anthropic (unstructured with swap correction) ---
+            if cid in anthropic_map:
+                anthropic_result = anthropic_map[cid]
+                if anthropic_result["result"]["type"] == "succeeded":
+                    a_raw = anthropic_result["result"]["message"]["content"][0]["text"]
+                    a_score_match = re.search(r'<score>\s*([-+]?\d*\.\d+|\d+)\s*</score>', a_raw)
+                    if not a_score_match:
+                        a_score_match = re.search(r'(-?\d+(\.\d+)?)', a_raw)
+
+                    thought = a_raw
+                    score = float(a_score_match.group(1)) if a_score_match else 0.0
+
+                    # Apply swap correction
+                    if anthropic_swaps.get(cid, False):
+                        thought = f"(SWAPPED)\n{thought}"
+                        score = -score
+
+                    anthropic_score = DatasetPairScore(thought=thought, score=score)
+                else:
+                    error_type = anthropic_result["result"]["type"]
+                    error_msg = anthropic_result["result"].get("error", {}).get("message", "")
+                    anthropic_score = DatasetPairScore(
+                        thought=f"(Anthropic request {error_type}: {error_msg})",
+                        score=0.0,
+                    )
+            else:
+                anthropic_score = DatasetPairScore(thought="(Anthropic returned no result)", score=0.0)
+
+            scored_pairs.append(
+                DatasetPairScoredPair(
+                    pair=pair,
+                    openai_score=openai_score,
+                    groq_score=groq_score,
+                    anthropic_score=anthropic_score,
+                )
+            )
+
+        return DatasetPairScoredPairs(scored_pairs=scored_pairs)
+
 async def score_pairs(self, pairs: DatasetPairs) -> DatasetPairScoredPairs:
     scored_pairs = await tqdm_asyncio.gather(
         *[
@@ -196,4 +800,4 @@ async def score_pairs(self, pairs: DatasetPairs) -> DatasetPairScoredPairs:
 
 def save_scores(self, scores: DatasetPairScoredPairs) -> None:
     with open(self.scores_path, "w") as f:
-        f.write(scores.model_dump_json(indent=4))
+        f.write(scores.model_dump_json(indent=4))
\ No newline at end of file
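
Review note: the two behaviors this diff leans on most — deterministic A/B swapping seeded with random.seed("score"), and pulling the verdict out of a <score>...</score> block — are easy to regression-test in isolation. A minimal sketch follows; extract_score and apply_swap are hypothetical helpers mirroring the PR's logic, not functions from the diff itself.

import random
import re

SCORE_RE = re.compile(r'<score>\s*([-+]?\d*\.\d+|\d+)\s*</score>')

def extract_score(response: str) -> float:
    # Same tag-based extraction score.py uses, with 0.0 as the no-match fallback.
    m = SCORE_RE.search(response)
    return float(m.group(1)) if m else 0.0

def apply_swap(score: float, swapped: bool) -> float:
    # A positive score means "document B wins"; if A/B were swapped before
    # prompting, negate so the score refers to the original ordering.
    return -score if swapped else score

random.seed("score")  # the same deterministic seed the PR uses
swapped = random.random() < 0.5
raw = "Document B covers the query directly.\n<score>\n0.60\n</score>"
print(apply_swap(extract_score(raw), swapped))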
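Review note: submit_openai_batch, submit_groq_batch, and submit_anthropic_batch each hand-roll the same poll-until-terminal loop. A possible shared helper is sketched below against the call shapes already used above; it assumes nothing beyond an awaitable retrieve call and a status string, and the names are hypothetical.

import asyncio
from typing import Awaitable, Callable, TypeVar

T = TypeVar("T")

async def poll_until_terminal(
    retrieve: Callable[[], Awaitable[T]],
    status_of: Callable[[T], str],
    done: set[str],
    failed: set[str],
    interval: float = 10.0,
) -> T:
    # Poll a batch job until its status reaches a done or failed state.
    while True:
        job = await retrieve()
        status = status_of(job)
        print(f"[Batch] Status: {status}")
        if status in done:
            return job
        if status in failed:
            raise RuntimeError(f"Batch ended with status={status}")
        await asyncio.sleep(interval)

# For example, the OpenAI loop in submit_openai_batch could become:
# b = await poll_until_terminal(
#     lambda: client.batches.retrieve(batch.id),
#     lambda job: job.status,
#     done={"completed"},
#     failed={"failed", "expired", "canceled"},
# )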
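Review note: with gemini_score renamed to groq_score, step4_compose_annotated_dataset still counts one 1/3 vote per model that scores above zero before feeding the win matrix. A tiny worked example of that accumulation, where VoteInput is a hypothetical stand-in for DatasetPairScoredPair:

from dataclasses import dataclass

@dataclass
class VoteInput:
    openai_score: float
    groq_score: float
    anthropic_score: float

def pairwise_win_fraction(v: VoteInput) -> float:
    # Each model contributes 1/3 if it prefers document B (score > 0),
    # so the result is the fraction of the ensemble voting for B.
    return sum(1/3 for s in (v.openai_score, v.groq_score, v.anthropic_score) if s > 0)

# annotation.py then does w[j, i] += score and w[i, j] += 1 - score
# before the ELO-style fit.
print(pairwise_win_fraction(VoteInput(0.4, -0.2, 0.9)))  # → 0.666...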