Version0 of anserini notebooks

tira-io · Jul 21, 2023 · d59ee45 · d59ee45
1 parent e8dde41
commit d59ee45
Show file tree

Hide file tree

Showing 15 changed files with 1,966 additions and 0 deletions.
diff --git a/tira-ir-starters/pyserini/Dockerfile.base b/tira-ir-starters/pyserini/Dockerfile.base
@@ -0,0 +1,19 @@
+# Base image for the PySerini starters; the JDK is installed because pyserini runs Anserini/Lucene on the JVM.
+FROM pytorch/pytorch:1.12.0-cuda11.3-cudnn8-runtime
+
+RUN apt-get update \
+	&& apt-get install -y git openjdk-11-jdk build-essential \
+	&& rm -rf /var/lib/apt/lists/*
+
+RUN pip3 install pyserini pandas jupyterlab runnb
+
+RUN pip3 install tira==0.0.22
+
+ENV PYTHONPATH=/workspace
+
+RUN pip3 install faiss-cpu
+
+ADD *.ipynb /workspace/
+
+# Trust the notebooks only after ADD has copied them; before that the glob matches no file.
+RUN jupyter trust /workspace/*.ipynb
diff --git a/tira-ir-starters/pyserini/Makefile b/tira-ir-starters/pyserini/Makefile
@@ -0,0 +1,3 @@
+.PHONY: jupyter
+jupyter: # Run Jupyter from the base image with ~/.tira and this directory mounted (port 8888).
+	docker run --rm -ti -p 8888:8888 -v ${HOME}/.tira:/root/.tira -v ${HOME}/.tira:/home/jovyan/.tira -v "$(CURDIR)":/workspace webis/tira-ir-baselines-pyserini:0.0.1-base jupyter notebook --allow-root --ip 0.0.0.0
diff --git a/tira-ir-starters/pyserini/full-rank-bm25-rm3.ipynb b/tira-ir-starters/pyserini/full-rank-bm25-rm3.ipynb
@@ -0,0 +1,203 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "70566b0d",
+   "metadata": {},
+   "source": [
+    "# BM25+RM3 with PySerini"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ba44b2e5",
+   "metadata": {},
+   "source": [
+    "### Step 1: Import everything and load variables"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "056a33fe",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/conda/lib/python3.7/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "I will use a small hardcoded example located in ./sample-input-full-rank.\n",
+      "The output directory is /tmp/\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pyserini.search.lucene import LuceneSearcher\n",
+    "import pandas as pd\n",
+    "from tira.third_party_integrations import get_input_directory_and_output_directory, persist_and_normalize_run\n",
+    "import json\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "input_directory, output_directory = get_input_directory_and_output_directory('./sample-input-full-rank')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "963a9a84",
+   "metadata": {},
+   "source": [
+    "### Step 2: Create Index and Searcher"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "bd993ec8",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "5it [00:00, 5370.43it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING: sun.reflect.Reflection.getCallerClass is not supported. This will impact performance.\n",
+      "2023-07-21 07:31:17,518 INFO  [main] index.IndexCollection (IndexCollection.java:250) - Setting log level to INFO\n",
+      "2023-07-21 07:31:17,519 INFO  [main] index.IndexCollection (IndexCollection.java:253) - Starting indexer...\n",
+      "2023-07-21 07:31:17,519 INFO  [main] index.IndexCollection (IndexCollection.java:254) - ============ Loading Parameters ============\n",
+      "2023-07-21 07:31:17,520 INFO  [main] index.IndexCollection (IndexCollection.java:255) - DocumentCollection path: /tmp/anserini-docs\n",
+      "2023-07-21 07:31:17,520 INFO  [main] index.IndexCollection (IndexCollection.java:256) - CollectionClass: JsonCollection\n",
+      "2023-07-21 07:31:17,520 INFO  [main] index.IndexCollection (IndexCollection.java:257) - Generator: DefaultLuceneDocumentGenerator\n",
+      "2023-07-21 07:31:17,520 INFO  [main] index.IndexCollection (IndexCollection.java:258) - Threads: 1\n",
+      "2023-07-21 07:31:17,521 INFO  [main] index.IndexCollection (IndexCollection.java:259) - Language: en\n",
+      "2023-07-21 07:31:17,521 INFO  [main] index.IndexCollection (IndexCollection.java:260) - Stemmer: porter\n",
+      "2023-07-21 07:31:17,521 INFO  [main] index.IndexCollection (IndexCollection.java:261) - Keep stopwords? false\n",
+      "2023-07-21 07:31:17,521 INFO  [main] index.IndexCollection (IndexCollection.java:262) - Stopwords: null\n",
+      "2023-07-21 07:31:17,521 INFO  [main] index.IndexCollection (IndexCollection.java:263) - Store positions? true\n",
+      "2023-07-21 07:31:17,522 INFO  [main] index.IndexCollection (IndexCollection.java:264) - Store docvectors? true\n",
+      "2023-07-21 07:31:17,522 INFO  [main] index.IndexCollection (IndexCollection.java:265) - Store document \"contents\" field? false\n",
+      "2023-07-21 07:31:17,522 INFO  [main] index.IndexCollection (IndexCollection.java:266) - Store document \"raw\" field? false\n",
+      "2023-07-21 07:31:17,523 INFO  [main] index.IndexCollection (IndexCollection.java:267) - Additional fields to index: []\n",
+      "2023-07-21 07:31:17,523 INFO  [main] index.IndexCollection (IndexCollection.java:268) - Optimize (merge segments)? false\n",
+      "2023-07-21 07:31:17,523 INFO  [main] index.IndexCollection (IndexCollection.java:269) - Whitelist: null\n",
+      "2023-07-21 07:31:17,524 INFO  [main] index.IndexCollection (IndexCollection.java:270) - Pretokenized?: false\n",
+      "2023-07-21 07:31:17,524 INFO  [main] index.IndexCollection (IndexCollection.java:271) - Index path: /tmp/index\n",
+      "2023-07-21 07:31:17,527 INFO  [main] index.IndexCollection (IndexCollection.java:309) - ============ Indexing Collection ============\n",
+      "2023-07-21 07:31:17,819 INFO  [main] index.IndexCollection (IndexCollection.java:424) - Thread pool with 1 threads initialized.\n",
+      "2023-07-21 07:31:17,819 INFO  [main] index.IndexCollection (IndexCollection.java:426) - Initializing collection in /tmp/anserini-docs\n",
+      "2023-07-21 07:31:17,821 INFO  [main] index.IndexCollection (IndexCollection.java:435) - 1 file found\n",
+      "2023-07-21 07:31:17,821 INFO  [main] index.IndexCollection (IndexCollection.java:436) - Starting to index...\n",
+      "2023-07-21 07:31:17,974 DEBUG [pool-2-thread-1] index.IndexCollection$LocalIndexerThread (IndexCollection.java:215) - anserini-docs/part-01.json: 5 docs added.\n",
+      "2023-07-21 07:31:18,227 INFO  [main] index.IndexCollection (IndexCollection.java:492) - Indexing Complete! 5 documents indexed\n",
+      "2023-07-21 07:31:18,228 INFO  [main] index.IndexCollection (IndexCollection.java:493) - ============ Final Counter Values ============\n",
+      "2023-07-21 07:31:18,229 INFO  [main] index.IndexCollection (IndexCollection.java:494) - indexed:                5\n",
+      "2023-07-21 07:31:18,230 INFO  [main] index.IndexCollection (IndexCollection.java:495) - unindexable:            0\n",
+      "2023-07-21 07:31:18,230 INFO  [main] index.IndexCollection (IndexCollection.java:496) - empty:                  0\n",
+      "2023-07-21 07:31:18,231 INFO  [main] index.IndexCollection (IndexCollection.java:497) - skipped:                0\n",
+      "2023-07-21 07:31:18,232 INFO  [main] index.IndexCollection (IndexCollection.java:498) - errors:                 0\n",
+      "2023-07-21 07:31:18,250 INFO  [main] index.IndexCollection (IndexCollection.java:501) - Total 5 documents indexed in 00:00:00\n"
+     ]
+    }
+   ],
+   "source": [
+    "!mkdir -p /tmp/anserini-docs\n",
+    "\n",
+    "# Convert the TIRA documents (docno/text) into Anserini's JsonCollection layout (id/contents).\n",
+    "with open(f'{input_directory}/documents.jsonl', encoding='utf-8') as documents, open('/tmp/anserini-docs/part-01.json', 'w', encoding='utf-8') as ans:\n",
+    "    for line in tqdm(documents):\n",
+    "        if not line.strip():\n",
+    "            continue  # tolerate blank lines such as a trailing newline at the end of the file\n",
+    "        doc = json.loads(line)\n",
+    "        ans.write(json.dumps({\"id\": doc['docno'], \"contents\": doc['text']}) + '\\n')\n",
+    "\n",
+    "# Build the Lucene index; document vectors are stored because RM3 relevance feedback reads them.\n",
+    "!python -m pyserini.index.lucene \\\n",
+    "  --collection JsonCollection \\\n",
+    "  --input /tmp/anserini-docs \\\n",
+    "  --index /tmp/index \\\n",
+    "  --generator DefaultLuceneDocumentGenerator \\\n",
+    "  --threads 1 \\\n",
+    "  --storePositions --storeDocvectors\n",
+    "\n",
+    "# BM25 ranking with RM3 query expansion, both with pyserini's default parameters.\n",
+    "searcher = LuceneSearcher('/tmp/index')\n",
+    "searcher.set_bm25()\n",
+    "searcher.set_rm3()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "25653b1a",
+   "metadata": {},
+   "source": [
+    "### Step 3: Create Run"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "7ad73a92",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rows = []\n",
+    "\n",
+    "with open(f'{input_directory}/queries.jsonl', encoding='utf-8') as queries:\n",
+    "    for line in queries:\n",
+    "        if not line.strip():\n",
+    "            continue  # tolerate blank lines such as a trailing newline at the end of the file\n",
+    "        query = json.loads(line)\n",
+    "        # Retrieve up to 1000 hits per query, the same depth that is passed when persisting the run.\n",
+    "        for hit in searcher.search(query['query'], 1000):\n",
+    "            rows.append({\"qid\": query['qid'], \"score\": hit.score, \"docno\": hit.docid})\n",
+    "\n",
+    "# Explicit columns keep the frame well-formed even when no query returned a hit.\n",
+    "run = pd.DataFrame(rows, columns=['qid', 'score', 'docno'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4d828d78",
+   "metadata": {},
+   "source": [
+    "### Step 4: Persist Run"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "6524fc70",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Persist the run to the TIRA output directory; depth=1000 equals the number of hits requested per query.\n",
+    "persist_and_normalize_run(\n",
+    "    run,\n",
+    "    output_file=output_directory,\n",
+    "    system_name='BM25+RM3',\n",
+    "    depth=1000,\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		.PHONY: jupyter
		jupyter: # Run Jupyter from the base image with ~/.tira and this directory mounted (port 8888).
		docker run --rm -ti -p 8888:8888 -v ${HOME}/.tira:/root/.tira -v ${HOME}/.tira:/home/jovyan/.tira -v "$(CURDIR)":/workspace webis/tira-ir-baselines-pyserini:0.0.1-base jupyter notebook --allow-root --ip 0.0.0.0