-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
15 changed files
with
1,966 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
FROM pytorch/pytorch:1.12.0-cuda11.3-cudnn8-runtime | ||
|
||
RUN apt-get update \ | ||
&& apt-get install -y git openjdk-11-jdk build-essential | ||
|
||
RUN pip3 install pyserini pandas jupyterlab runnb | ||
|
||
RUN pip3 install tira==0.0.22 | ||
|
||
ENV PYTHONPATH=/workspace | ||
|
||
RUN jupyter trust /workspace/*.ipynb | ||
|
||
RUN pip3 install faiss-cpu | ||
|
||
ADD *.ipynb /workspace/ | ||
|
||
RUN jupyter trust /workspace/*.ipynb | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
jupyter: | ||
docker run --rm -ti -p 8888:8888 -v ${HOME}/.tira:/root/.tira -v ${HOME}/.tira:/home/jovyan/.tira -v "${PWD}":/workspace webis/tira-ir-baselines-pyserini:0.0.1-base jupyter notebook --allow-root --ip 0.0.0.0 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,203 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "70566b0d", | ||
"metadata": {}, | ||
"source": [ | ||
"# BM25+RM3 with PySerini" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "ba44b2e5", | ||
"metadata": {}, | ||
"source": [ | ||
"### Step 1: Import everything and load variables" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "056a33fe", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"/opt/conda/lib/python3.7/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", | ||
" from .autonotebook import tqdm as notebook_tqdm\n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"I will use a small hardcoded example located in ./sample-input-full-rank.\n", | ||
"The output directory is /tmp/\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from pyserini.search.lucene import LuceneSearcher\n", | ||
"import pandas as pd\n", | ||
"from tira.third_party_integrations import get_input_directory_and_output_directory, persist_and_normalize_run\n", | ||
"import json\n", | ||
"from tqdm import tqdm\n", | ||
"\n", | ||
"input_directory, output_directory = get_input_directory_and_output_directory('./sample-input-full-rank')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "963a9a84", | ||
"metadata": {}, | ||
"source": [ | ||
"### Step 2: Create Index and Searcher" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "bd993ec8", | ||
"metadata": { | ||
"scrolled": true | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"5it [00:00, 5370.43it/s]\n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"WARNING: sun.reflect.Reflection.getCallerClass is not supported. This will impact performance.\n", | ||
"2023-07-21 07:31:17,518 INFO [main] index.IndexCollection (IndexCollection.java:250) - Setting log level to INFO\n", | ||
"2023-07-21 07:31:17,519 INFO [main] index.IndexCollection (IndexCollection.java:253) - Starting indexer...\n", | ||
"2023-07-21 07:31:17,519 INFO [main] index.IndexCollection (IndexCollection.java:254) - ============ Loading Parameters ============\n", | ||
"2023-07-21 07:31:17,520 INFO [main] index.IndexCollection (IndexCollection.java:255) - DocumentCollection path: /tmp/anserini-docs\n", | ||
"2023-07-21 07:31:17,520 INFO [main] index.IndexCollection (IndexCollection.java:256) - CollectionClass: JsonCollection\n", | ||
"2023-07-21 07:31:17,520 INFO [main] index.IndexCollection (IndexCollection.java:257) - Generator: DefaultLuceneDocumentGenerator\n", | ||
"2023-07-21 07:31:17,520 INFO [main] index.IndexCollection (IndexCollection.java:258) - Threads: 1\n", | ||
"2023-07-21 07:31:17,521 INFO [main] index.IndexCollection (IndexCollection.java:259) - Language: en\n", | ||
"2023-07-21 07:31:17,521 INFO [main] index.IndexCollection (IndexCollection.java:260) - Stemmer: porter\n", | ||
"2023-07-21 07:31:17,521 INFO [main] index.IndexCollection (IndexCollection.java:261) - Keep stopwords? false\n", | ||
"2023-07-21 07:31:17,521 INFO [main] index.IndexCollection (IndexCollection.java:262) - Stopwords: null\n", | ||
"2023-07-21 07:31:17,521 INFO [main] index.IndexCollection (IndexCollection.java:263) - Store positions? true\n", | ||
"2023-07-21 07:31:17,522 INFO [main] index.IndexCollection (IndexCollection.java:264) - Store docvectors? true\n", | ||
"2023-07-21 07:31:17,522 INFO [main] index.IndexCollection (IndexCollection.java:265) - Store document \"contents\" field? false\n", | ||
"2023-07-21 07:31:17,522 INFO [main] index.IndexCollection (IndexCollection.java:266) - Store document \"raw\" field? false\n", | ||
"2023-07-21 07:31:17,523 INFO [main] index.IndexCollection (IndexCollection.java:267) - Additional fields to index: []\n", | ||
"2023-07-21 07:31:17,523 INFO [main] index.IndexCollection (IndexCollection.java:268) - Optimize (merge segments)? false\n", | ||
"2023-07-21 07:31:17,523 INFO [main] index.IndexCollection (IndexCollection.java:269) - Whitelist: null\n", | ||
"2023-07-21 07:31:17,524 INFO [main] index.IndexCollection (IndexCollection.java:270) - Pretokenized?: false\n", | ||
"2023-07-21 07:31:17,524 INFO [main] index.IndexCollection (IndexCollection.java:271) - Index path: /tmp/index\n", | ||
"2023-07-21 07:31:17,527 INFO [main] index.IndexCollection (IndexCollection.java:309) - ============ Indexing Collection ============\n", | ||
"2023-07-21 07:31:17,819 INFO [main] index.IndexCollection (IndexCollection.java:424) - Thread pool with 1 threads initialized.\n", | ||
"2023-07-21 07:31:17,819 INFO [main] index.IndexCollection (IndexCollection.java:426) - Initializing collection in /tmp/anserini-docs\n", | ||
"2023-07-21 07:31:17,821 INFO [main] index.IndexCollection (IndexCollection.java:435) - 1 file found\n", | ||
"2023-07-21 07:31:17,821 INFO [main] index.IndexCollection (IndexCollection.java:436) - Starting to index...\n", | ||
"2023-07-21 07:31:17,974 DEBUG [pool-2-thread-1] index.IndexCollection$LocalIndexerThread (IndexCollection.java:215) - anserini-docs/part-01.json: 5 docs added.\n", | ||
"2023-07-21 07:31:18,227 INFO [main] index.IndexCollection (IndexCollection.java:492) - Indexing Complete! 5 documents indexed\n", | ||
"2023-07-21 07:31:18,228 INFO [main] index.IndexCollection (IndexCollection.java:493) - ============ Final Counter Values ============\n", | ||
"2023-07-21 07:31:18,229 INFO [main] index.IndexCollection (IndexCollection.java:494) - indexed: 5\n", | ||
"2023-07-21 07:31:18,230 INFO [main] index.IndexCollection (IndexCollection.java:495) - unindexable: 0\n", | ||
"2023-07-21 07:31:18,230 INFO [main] index.IndexCollection (IndexCollection.java:496) - empty: 0\n", | ||
"2023-07-21 07:31:18,231 INFO [main] index.IndexCollection (IndexCollection.java:497) - skipped: 0\n", | ||
"2023-07-21 07:31:18,232 INFO [main] index.IndexCollection (IndexCollection.java:498) - errors: 0\n", | ||
"2023-07-21 07:31:18,250 INFO [main] index.IndexCollection (IndexCollection.java:501) - Total 5 documents indexed in 00:00:00\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"!mkdir -p /tmp/anserini-docs\n", | ||
"\n", | ||
"with open(f'{input_directory}/documents.jsonl') as documents, open(f'/tmp/anserini-docs/part-01.json', 'w') as ans:\n", | ||
" for doc in tqdm(documents):\n", | ||
" doc = json.loads(doc)\n", | ||
" ans.write(json.dumps({\"id\": doc['docno'], \"contents\": doc['text']}) + '\\n')\n", | ||
"\n", | ||
"!python -m pyserini.index.lucene \\\n", | ||
" --collection JsonCollection \\\n", | ||
" --input /tmp/anserini-docs \\\n", | ||
" --index /tmp/index \\\n", | ||
" --generator DefaultLuceneDocumentGenerator \\\n", | ||
" --threads 1 \\\n", | ||
" --storePositions --storeDocvectors\n", | ||
"\n", | ||
"searcher = LuceneSearcher('/tmp/index')\n", | ||
"searcher.set_bm25()\n", | ||
"searcher.set_rm3()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "25653b1a", | ||
"metadata": {}, | ||
"source": [ | ||
"### Step 3: Create Run" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"id": "7ad73a92", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"run = []\n", | ||
"\n", | ||
"with open(f'{input_directory}/queries.jsonl') as queries:\n", | ||
" for query in queries:\n", | ||
" query = json.loads(query)\n", | ||
" for doc in searcher.search(query['query'], 1000):\n", | ||
" run += [{\"qid\": query['qid'], \"score\": doc.score, \"docno\": doc.docid}]\n", | ||
"run = pd.DataFrame(run)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "4d828d78", | ||
"metadata": {}, | ||
"source": [ | ||
"### Step 4: Persist Run" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"id": "6524fc70", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"persist_and_normalize_run(run, output_file=output_directory, system_name='BM25+RM3', depth=1000)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.13" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Oops, something went wrong.