Skip to content

Commit

Permalink
Version0 of anserini notebooks
Browse files Browse the repository at this point in the history
  • Loading branch information
mam10eks committed Jul 21, 2023
1 parent e8dde41 commit d59ee45
Show file tree
Hide file tree
Showing 15 changed files with 1,966 additions and 0 deletions.
19 changes: 19 additions & 0 deletions tira-ir-starters/pyserini/Dockerfile.base
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Base image for the Pyserini starter notebooks: PyTorch runtime plus Pyserini, Jupyter and the TIRA client.
FROM pytorch/pytorch:1.12.0-cuda11.3-cudnn8-runtime

# git, a JDK (the Pyserini indexer runs Java code) and a compiler toolchain.
# The apt package lists are removed in the same layer so they do not end up in the image.
RUN apt-get update \
    && apt-get install -y git openjdk-11-jdk build-essential \
    && rm -rf /var/lib/apt/lists/*

RUN pip3 install pyserini pandas jupyterlab runnb

RUN pip3 install tira==0.0.22

# Make modules placed next to the notebooks importable.
ENV PYTHONPATH=/workspace

RUN pip3 install faiss-cpu

# Copy the notebooks after all dependency layers so that editing a notebook does not
# invalidate the (slow) installation layers above. COPY is sufficient for plain local files.
COPY *.ipynb /workspace/

# Trust the notebooks only once they exist in the image. Running `jupyter trust` before the
# notebooks are copied hands it an unexpanded glob, i.e. a path that does not exist.
RUN jupyter trust /workspace/*.ipynb

3 changes: 3 additions & 0 deletions tira-ir-starters/pyserini/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Start a Jupyter notebook server from the base image on port 8888, with the current
# directory mounted as /workspace.
# ~/.tira is mounted into both /root and /home/jovyan — presumably so the TIRA settings are
# found regardless of which user the container runs as (TODO confirm).
# The target creates no file, so it is declared phony; otherwise a file or directory named
# "jupyter" would make it look up to date.
.PHONY: jupyter
jupyter:
	docker run --rm -ti -p 8888:8888 \
		-v ${HOME}/.tira:/root/.tira \
		-v ${HOME}/.tira:/home/jovyan/.tira \
		-v "${PWD}":/workspace \
		webis/tira-ir-baselines-pyserini:0.0.1-base \
		jupyter notebook --allow-root --ip 0.0.0.0

203 changes: 203 additions & 0 deletions tira-ir-starters/pyserini/full-rank-bm25-rm3.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "70566b0d",
"metadata": {},
"source": [
"# BM25+RM3 with Pyserini"
]
},
{
"cell_type": "markdown",
"id": "ba44b2e5",
"metadata": {},
"source": [
"### Step 1: Import everything and load variables"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "056a33fe",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/lib/python3.7/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"I will use a small hardcoded example located in ./sample-input-full-rank.\n",
"The output directory is /tmp/\n"
]
}
],
"source": [
"from pyserini.search.lucene import LuceneSearcher\n",
"import pandas as pd\n",
"from tira.third_party_integrations import get_input_directory_and_output_directory, persist_and_normalize_run\n",
"import json\n",
"from tqdm import tqdm\n",
"\n",
"# Resolve the directory to read documents/queries from and the directory to write the run to.\n",
"# The path passed here is the default input: the recorded output of this cell reports that the\n",
"# small hardcoded example in ./sample-input-full-rank is used and that the output goes to /tmp/.\n",
"# NOTE(review): presumably TIRA supplies the real directories when the notebook runs there — confirm.\n",
"input_directory, output_directory = get_input_directory_and_output_directory('./sample-input-full-rank')"
]
},
{
"cell_type": "markdown",
"id": "963a9a84",
"metadata": {},
"source": [
"### Step 2: Create Index and Searcher"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "bd993ec8",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"5it [00:00, 5370.43it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING: sun.reflect.Reflection.getCallerClass is not supported. This will impact performance.\n",
"2023-07-21 07:31:17,518 INFO [main] index.IndexCollection (IndexCollection.java:250) - Setting log level to INFO\n",
"2023-07-21 07:31:17,519 INFO [main] index.IndexCollection (IndexCollection.java:253) - Starting indexer...\n",
"2023-07-21 07:31:17,519 INFO [main] index.IndexCollection (IndexCollection.java:254) - ============ Loading Parameters ============\n",
"2023-07-21 07:31:17,520 INFO [main] index.IndexCollection (IndexCollection.java:255) - DocumentCollection path: /tmp/anserini-docs\n",
"2023-07-21 07:31:17,520 INFO [main] index.IndexCollection (IndexCollection.java:256) - CollectionClass: JsonCollection\n",
"2023-07-21 07:31:17,520 INFO [main] index.IndexCollection (IndexCollection.java:257) - Generator: DefaultLuceneDocumentGenerator\n",
"2023-07-21 07:31:17,520 INFO [main] index.IndexCollection (IndexCollection.java:258) - Threads: 1\n",
"2023-07-21 07:31:17,521 INFO [main] index.IndexCollection (IndexCollection.java:259) - Language: en\n",
"2023-07-21 07:31:17,521 INFO [main] index.IndexCollection (IndexCollection.java:260) - Stemmer: porter\n",
"2023-07-21 07:31:17,521 INFO [main] index.IndexCollection (IndexCollection.java:261) - Keep stopwords? false\n",
"2023-07-21 07:31:17,521 INFO [main] index.IndexCollection (IndexCollection.java:262) - Stopwords: null\n",
"2023-07-21 07:31:17,521 INFO [main] index.IndexCollection (IndexCollection.java:263) - Store positions? true\n",
"2023-07-21 07:31:17,522 INFO [main] index.IndexCollection (IndexCollection.java:264) - Store docvectors? true\n",
"2023-07-21 07:31:17,522 INFO [main] index.IndexCollection (IndexCollection.java:265) - Store document \"contents\" field? false\n",
"2023-07-21 07:31:17,522 INFO [main] index.IndexCollection (IndexCollection.java:266) - Store document \"raw\" field? false\n",
"2023-07-21 07:31:17,523 INFO [main] index.IndexCollection (IndexCollection.java:267) - Additional fields to index: []\n",
"2023-07-21 07:31:17,523 INFO [main] index.IndexCollection (IndexCollection.java:268) - Optimize (merge segments)? false\n",
"2023-07-21 07:31:17,523 INFO [main] index.IndexCollection (IndexCollection.java:269) - Whitelist: null\n",
"2023-07-21 07:31:17,524 INFO [main] index.IndexCollection (IndexCollection.java:270) - Pretokenized?: false\n",
"2023-07-21 07:31:17,524 INFO [main] index.IndexCollection (IndexCollection.java:271) - Index path: /tmp/index\n",
"2023-07-21 07:31:17,527 INFO [main] index.IndexCollection (IndexCollection.java:309) - ============ Indexing Collection ============\n",
"2023-07-21 07:31:17,819 INFO [main] index.IndexCollection (IndexCollection.java:424) - Thread pool with 1 threads initialized.\n",
"2023-07-21 07:31:17,819 INFO [main] index.IndexCollection (IndexCollection.java:426) - Initializing collection in /tmp/anserini-docs\n",
"2023-07-21 07:31:17,821 INFO [main] index.IndexCollection (IndexCollection.java:435) - 1 file found\n",
"2023-07-21 07:31:17,821 INFO [main] index.IndexCollection (IndexCollection.java:436) - Starting to index...\n",
"2023-07-21 07:31:17,974 DEBUG [pool-2-thread-1] index.IndexCollection$LocalIndexerThread (IndexCollection.java:215) - anserini-docs/part-01.json: 5 docs added.\n",
"2023-07-21 07:31:18,227 INFO [main] index.IndexCollection (IndexCollection.java:492) - Indexing Complete! 5 documents indexed\n",
"2023-07-21 07:31:18,228 INFO [main] index.IndexCollection (IndexCollection.java:493) - ============ Final Counter Values ============\n",
"2023-07-21 07:31:18,229 INFO [main] index.IndexCollection (IndexCollection.java:494) - indexed: 5\n",
"2023-07-21 07:31:18,230 INFO [main] index.IndexCollection (IndexCollection.java:495) - unindexable: 0\n",
"2023-07-21 07:31:18,230 INFO [main] index.IndexCollection (IndexCollection.java:496) - empty: 0\n",
"2023-07-21 07:31:18,231 INFO [main] index.IndexCollection (IndexCollection.java:497) - skipped: 0\n",
"2023-07-21 07:31:18,232 INFO [main] index.IndexCollection (IndexCollection.java:498) - errors: 0\n",
"2023-07-21 07:31:18,250 INFO [main] index.IndexCollection (IndexCollection.java:501) - Total 5 documents indexed in 00:00:00\n"
]
}
],
"source": [
"# The indexer reads every file below --input, so the converted corpus gets a directory of its own.\n",
"!mkdir -p /tmp/anserini-docs\n",
"\n",
"# Convert the input documents (one JSON object per line with 'docno' and 'text') into the\n",
"# {'id': ..., 'contents': ...} records that are indexed below. Both files are opened as UTF-8\n",
"# explicitly so the conversion does not depend on the locale of the container.\n",
"with open(f'{input_directory}/documents.jsonl', encoding='utf-8') as documents, open('/tmp/anserini-docs/part-01.json', 'w', encoding='utf-8') as anserini_docs:\n",
"    for line in tqdm(documents):\n",
"        if not line.strip():\n",
"            continue  # tolerate blank lines, e.g. trailing newlines at the end of the file\n",
"        doc = json.loads(line)\n",
"        anserini_docs.write(json.dumps({\"id\": doc['docno'], \"contents\": doc['text']}) + '\\n')\n",
"\n",
"# Build the Lucene index. Positions and document vectors are stored;\n",
"# RM3 query expansion relies on the stored document vectors.\n",
"!python -m pyserini.index.lucene \\\n",
"    --collection JsonCollection \\\n",
"    --input /tmp/anserini-docs \\\n",
"    --index /tmp/index \\\n",
"    --generator DefaultLuceneDocumentGenerator \\\n",
"    --threads 1 \\\n",
"    --storePositions --storeDocvectors\n",
"\n",
"# BM25 ranking with RM3 expansion, both with the library's default parameters.\n",
"searcher = LuceneSearcher('/tmp/index')\n",
"searcher.set_bm25()\n",
"searcher.set_rm3()"
]
},
{
"cell_type": "markdown",
"id": "25653b1a",
"metadata": {},
"source": [
"### Step 3: Create Run"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "7ad73a92",
"metadata": {},
"outputs": [],
"source": [
"# Retrieve up to 1000 hits per query and collect one record per (query, document) pair.\n",
"# The records live in their own list so that `run` only ever names the final DataFrame.\n",
"run_records = []\n",
"\n",
"with open(f'{input_directory}/queries.jsonl', encoding='utf-8') as queries:\n",
"    for line in queries:\n",
"        if not line.strip():\n",
"            continue  # tolerate blank lines, e.g. trailing newlines at the end of the file\n",
"        query = json.loads(line)\n",
"        run_records.extend(\n",
"            {\"qid\": query['qid'], \"score\": hit.score, \"docno\": hit.docid}\n",
"            for hit in searcher.search(query['query'], 1000)\n",
"        )\n",
"\n",
"# Explicit columns keep the frame well-formed even when no query retrieved any document.\n",
"run = pd.DataFrame(run_records, columns=['qid', 'score', 'docno'])"
]
},
{
"cell_type": "markdown",
"id": "4d828d78",
"metadata": {},
"source": [
"### Step 4: Persist Run"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "6524fc70",
"metadata": {},
"outputs": [],
"source": [
"# Hand the run to the TIRA helper under the system name 'BM25+RM3'. depth=1000 equals the\n",
"# number of hits requested per query when the run was created.\n",
"# NOTE(review): output_file receives the output *directory*; presumably the helper picks the\n",
"# file name inside it — confirm against the tira documentation.\n",
"persist_and_normalize_run(run, output_file=output_directory, system_name='BM25+RM3', depth=1000)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading

0 comments on commit d59ee45

Please sign in to comment.