import torch
import tqdm
from colbert.infra import Run, RunConfig
from colbert.infra.launcher import Launcher
from colbert.modeling.reranker.electra import ElectraReranker
from colbert.utils.utils import flatten
from transformers import AutoModelForSequenceClassification, AutoTokenizer

DEFAULT_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"


class Scorer:
    """
    Scores (query id, passage id) pairs with a Hugging Face cross-encoder,
    distributing the work across ranks via the ColBERT Launcher.
    """

    def __init__(self, queries, collection, model=DEFAULT_MODEL, maxlen=180, bsize=256):
        self.queries = queries  # qid -> query text
        self.collection = collection  # pid -> passage text
        self.model = model  # Hugging Face model name or path

        self.maxlen = maxlen  # max tokenized length of a query-passage pair
        self.bsize = bsize  # batch size for scoring

    def launch(self, qids, pids):
        # Distribute the (qid, pid) pairs across all ranks, then flatten the
        # per-rank score lists back into a single list.
        launcher = Launcher(self._score_pairs_process, return_all=True)
        outputs = launcher.launch(Run().config, qids, pids)

        return flatten(outputs)

    def _score_pairs_process(self, config, qids, pids):
        assert len(qids) == len(pids), (len(qids), len(pids))

        # Ceiling-style split: each rank scores a contiguous share of the
        # pairs, and only rank 0 displays a progress bar.
        share = 1 + len(qids) // config.nranks
        offset = config.rank * share
        endpos = (1 + config.rank) * share

        return self._score_pairs(
            qids[offset:endpos], pids[offset:endpos], show_progress=(config.rank < 1)
        )

    def _score_pairs(self, qids, pids, show_progress=False):
        tokenizer = AutoTokenizer.from_pretrained(self.model)
        model = AutoModelForSequenceClassification.from_pretrained(self.model).cuda()

        assert len(qids) == len(pids), (len(qids), len(pids))

        scores = []

        model.eval()
        with torch.inference_mode():
            with torch.cuda.amp.autocast():
                for offset in tqdm.tqdm(
                    range(0, len(qids), self.bsize), disable=(not show_progress)
                ):
                    endpos = offset + self.bsize

                    queries_ = [self.queries[qid] for qid in qids[offset:endpos]]
                    passages_ = [self.collection[pid] for pid in pids[offset:endpos]]

                    # Tokenize the query/passage pairs jointly, padding each
                    # batch to its longest member and truncating to maxlen.
                    features = tokenizer(
                        queries_,
                        passages_,
                        padding="longest",
                        truncation=True,
                        return_tensors="pt",
                        max_length=self.maxlen,
                    ).to(model.device)

                    # The cross-encoder emits one relevance logit per pair.
                    scores.append(model(**features).logits.flatten())

        scores = torch.cat(scores)
        scores = scores.tolist()

        Run().print(f"Returning with {len(scores)} scores")

        return scores


# LONG-TERM TODO: This can be sped up by sorting by length in advance.
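

# A sketch of the length-sorting idea in the TODO above (illustrative, not
# part of this commit). With padding="longest", batches of similar-length
# inputs waste less compute on pad tokens. Character count stands in as a
# cheap proxy for token length; the function name is hypothetical.
def score_pairs_sorted(scorer, qids, pids, show_progress=False):
    # Order pair indices by approximate input length.
    order = sorted(
        range(len(qids)),
        key=lambda i: len(scorer.queries[qids[i]]) + len(scorer.collection[pids[i]]),
    )

    sorted_scores = scorer._score_pairs(
        [qids[i] for i in order], [pids[i] for i in order], show_progress
    )

    # Scatter the scores back into the caller's original pair order.
    scores = [None] * len(qids)
    for i, score in zip(order, sorted_scores):
        scores[i] = score

    return scores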
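
# A minimal usage sketch (not part of this commit). Assumes `queries` and
# `collection` are plain dicts mapping qid -> query text and pid -> passage
# text; the TSV file names, the load_tsv helper, and the example ids below
# are hypothetical. Running inside Run().context gives the Launcher an
# active RunConfig with nranks/rank set.
if __name__ == "__main__":
    def load_tsv(path):
        # Hypothetical helper: one "id<TAB>text" record per line.
        with open(path) as f:
            pairs = (line.rstrip("\n").split("\t", 1) for line in f)
            return {int(id_): text for id_, text in pairs}

    queries = load_tsv("queries.tsv")
    collection = load_tsv("collection.tsv")

    with Run().context(RunConfig(nranks=1)):
        scorer = Scorer(queries, collection)
        qids = [0, 0, 1]
        pids = [10, 11, 12]
        scores = scorer.launch(qids, pids)  # one float per (qid, pid) pair
        print(scores)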