full rank for beir
mam10eks committed May 3, 2023
1 parent 7aa8d00 commit b68d76c
Showing 3 changed files with 68 additions and 0 deletions.
2 changes: 2 additions & 0 deletions tira-ir-starters/beir/Dockerfile.dres
@@ -8,3 +8,5 @@
RUN pip3 install tira==0.0.9 \

COPY beir/reranking.py /reranking.py

COPY beir/full_ranking.py /full_ranking.py

12 changes: 12 additions & 0 deletions tira-ir-starters/beir/README.md
@@ -28,6 +28,14 @@
This creates a run file `tira-output/run.txt`, with content like (`cat sample-output/run.txt`):

```
19335 0 527689 3 0.988582968711853 sentence-transformers/msmarco-roberta-base-ance-firstp-cos_sim
```

We also provide a full-ranking script that you can run via `tira-run`:

```
tira-run \
--input-directory ${PWD}/sample-input-full-rank \
--image webis/tira-ir-starter-beir:0.0.2-msmarco-roberta-base-ance-firstp \
--command '/full_ranking.py --input $inputDataset --output $outputDir --score_function cos_sim'
```
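
The input directory must contain the corpus and the queries as JSON lines, matching the fields that `full_ranking.py` reads: a `documents.jsonl` with `docno` and `text` fields, and a `queries.jsonl` with `qid` and `query` fields. A minimal sketch (the ids and texts below are made up):

```
head -1 sample-input-full-rank/documents.jsonl
{"docno": "d1", "text": "..."}

head -1 sample-input-full-rank/queries.jsonl
{"qid": "q1", "query": "..."}
```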

## Submit the Image to TIRA

@@ -66,6 +74,10 @@
The remaining variants are:
docker build --build-arg DRES_MODEL=sentence-transformers/msmarco-roberta-base-ance-firstp -t webis/tira-ir-starter-beir:0.0.1-msmarco-roberta-base-ance-firstp -f beir/Dockerfile.dres .
```

```
docker build --build-arg DRES_MODEL=sentence-transformers/msmarco-roberta-base-ance-firstp -t webis/tira-ir-starter-beir:0.0.2-msmarco-roberta-base-ance-firstp -f beir/Dockerfile.dres .
```

54 changes: 54 additions & 0 deletions tira-ir-starters/beir/full_ranking.py
@@ -0,0 +1,54 @@
#!/usr/bin/env python3
import os
import argparse
import pandas as pd
from beir.retrieval import models
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from tira.third_party_integrations import persist_and_normalize_run


def parse_args():
    parser = argparse.ArgumentParser(prog='Retrieve with DenseRetrievalExactSearch models of BEIR.')

    parser.add_argument('--model', default=os.environ['DRES_MODEL'])
    parser.add_argument('--input', required=True)
    parser.add_argument('--output', required=True)
    parser.add_argument('--score_function', choices=['cos_sim', 'dot'], required=True)
    parser.add_argument('--batch_size', default=128)
    parser.add_argument('--corpus_chunk_size', default=50000)

    return vars(parser.parse_args())


def rank(df_queries, df_docs, sbert_model, score_function, batch_size, corpus_chunk_size):
    print(f'Rank {len(df_docs)} documents for {len(df_queries)} queries.')
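    # Wrap the sentence-transformer in BEIR's brute-force dense retriever (DenseRetrievalExactSearch).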
    model = DRES(sbert_model, batch_size=int(batch_size), corpus_chunk_size=int(corpus_chunk_size))

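    # BEIR expects the corpus as {doc_id: {'text': ...}} and the queries as {query_id: query_text}.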
    corpus = {i['docno']: {'text': i['text']} for _, i in df_docs.iterrows()}
    queries = {i['qid']: i['query'] for _, i in df_queries.iterrows()}

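    # Exhaustively score every query against the corpus; keep the 1000 highest-scoring documents per query.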
    scores = model.search(corpus=corpus, queries=queries, top_k=1000, score_function=score_function, return_sorted=True)
    ret = []

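    # Flatten the nested {qid: {docno: score}} mapping into one row per retrieved document.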
    for qid in scores:
        for doc_id in scores[qid]:
            ret += [{'qid': qid, 'Q0': 0, 'docno': doc_id, 'score': scores[qid][doc_id]}]

    return ret


def main(model, input, output, score_function, batch_size, corpus_chunk_size):
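    # The input directory provides the corpus and the queries as JSON lines (documents.jsonl, queries.jsonl).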
    df_docs = pd.read_json(f'{input}/documents.jsonl', lines=True)
    df_queries = pd.read_json(f'{input}/queries.jsonl', lines=True)
    sbert_model = models.SentenceBERT(model)

    ret = rank(df_queries, df_docs, sbert_model, score_function, batch_size, corpus_chunk_size)

    persist_and_normalize_run(pd.DataFrame(ret), model + '-' + score_function, output)


if __name__ == '__main__':
    args = parse_args()
    main(**args)
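
For local testing outside the container, the script can also be invoked directly. A minimal sketch (the paths are illustrative; `--model` falls back to the `DRES_MODEL` environment variable when omitted):

```
python3 full_ranking.py \
  --model sentence-transformers/msmarco-roberta-base-ance-firstp \
  --input sample-input-full-rank \
  --output tira-output \
  --score_function cos_sim
```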
