import logging

from PIL import ImageDraw
from pycocoevalcap.eval import COCOEvalCap, Bleu, Meteor, Rouge, Cider, Spice
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocotools.coco import COCO

# COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"]  # , "SPICE"]
COCO_METRICS = ["CIDEr"]

eval_logger = logging.getLogger("lmms-eval")


def screenspot_bbox_doc_to_visual(doc):
    """Draw the target bounding box on the screenshot and return it as the visual input."""
    bbox = doc["bbox"]
    image = doc["image"].convert("RGB")
    draw = ImageDraw.Draw(image)
    bbox_xy = [bbox[0], bbox[1], bbox[2], bbox[3]]
    draw.rectangle(bbox_xy, outline="red", width=3)
    return [image]


def screenspot_process_result(doc, result):
    """
    Args:
        doc: an instance of the eval dataset
        result: [pred]
    Returns:
        a dictionary keyed by metric name (e.g. screenspot_CIDEr), whose value is the per-doc data needed for aggregation
    """
    pred = result[0] if len(result) > 0 else ""
    ann_id = doc["file_name"]
    data_dict = {"instruction": doc["instruction"], "pred": pred, "ann_id": ann_id, "data_type": doc["data_type"], "data_source": doc["data_source"]}
    return {f"screenspot_{metric}": data_dict for metric in COCO_METRICS}
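
# Illustrative sketch only (hypothetical values, not part of the dataset schema):
# for a doc with instruction "click the red button" and file_name "ss_0001.png",
# and a model prediction "press the red button", screenspot_process_result
# returns one entry per metric in COCO_METRICS, e.g.
#   {"screenspot_CIDEr": {"instruction": "click the red button",
#                         "pred": "press the red button",
#                         "ann_id": "ss_0001.png",
#                         "data_type": "text", "data_source": "web"}}
# The aggregation functions below consume a list of these inner dicts.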


def screenspot_doc_to_text(doc):
    return f"Direct a user to interact with the highlighted region [{doc['bbox'][0]:.2f}, {doc['bbox'][1]:.2f}, {doc['bbox'][2]:.2f}, {doc['bbox'][3]:.2f}]."


def screenspot_aggregation_result(results, metric):
    # scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"), (Spice(), "SPICE")]
    scorers = [(Cider(), "CIDEr")]
    scorers_dict = {s[1]: s for s in scorers}

    stored_results = []
    # For the COCO caption eval tools to build their index, the dataset dict
    # needs two keys: "annotations" and "images".
    # "annotations" reproduces the original annotations (here, the instructions),
    # while "images" only needs an image id, which we assign sequentially.
    dataset = {"annotations": [], "images": []}
    idx = 0
    ann_id = 0
    for result in results:
        stored_results.append({"image_id": idx, "caption": result["pred"]})
        dataset["annotations"].append({"image_id": idx, "caption": result["instruction"], "id": ann_id})
        ann_id += 1

        dataset["images"].append({"id": idx})
        idx += 1
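
    # Illustrative shape of the two structures built above for two results
    # (hypothetical captions):
    #   stored_results = [{"image_id": 0, "caption": "<model pred 0>"},
    #                     {"image_id": 1, "caption": "<model pred 1>"}]
    #   dataset = {"annotations": [{"image_id": 0, "caption": "<instruction 0>", "id": 0},
    #                              {"image_id": 1, "caption": "<instruction 1>", "id": 1}],
    #              "images": [{"id": 0}, {"id": 1}]}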

    coco = COCO()
    # Manually create the index from the in-memory dataset (no JSON file involved)
    coco.dataset = dataset
    coco.createIndex()

    coco_result = coco.loadRes(stored_results)
    coco_eval = COCOEvalCap(coco, coco_result)

    imgIds = coco_eval.params["image_id"]
    gts = {}
    res = {}
    for imgId in imgIds:
        gts[imgId] = coco_eval.coco.imgToAnns[imgId]
        res[imgId] = coco_eval.cocoRes.imgToAnns[imgId]

    eval_logger.info("tokenization...")
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    eval_logger.info(f"Computing {metric} scores...")

    score, scores = scorers_dict[metric][0].compute_score(gts, res)
    # coco_eval.setEval(score, metric)

    # The Bleu scorer returns a list of Bleu_1..Bleu_4 scores,
    # so pick the one matching the requested metric.
    if isinstance(score, list):
        n = int(metric.split("_")[-1])
        score = score[n - 1]

    return score


def screenspot_bleu4(results):
    return screenspot_aggregation_result(results, "Bleu_4")


def screenspot_bleu3(results):
    return screenspot_aggregation_result(results, "Bleu_3")


def screenspot_bleu2(results):
    return screenspot_aggregation_result(results, "Bleu_2")


def screenspot_bleu1(results):
    return screenspot_aggregation_result(results, "Bleu_1")


def screenspot_meteor(results):
    return screenspot_aggregation_result(results, "METEOR")


def screenspot_rougel(results):
    return screenspot_aggregation_result(results, "ROUGE_L")


def screenspot_cider(results):
    return screenspot_aggregation_result(results, "CIDEr")


def screenspot_spice(results):
    return screenspot_aggregation_result(results, "SPICE")
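

# A minimal smoke-test sketch (illustrative, not part of the lmms-eval task
# interface). It fabricates two synthetic docs with the fields the functions
# above expect, runs the per-doc processing, and aggregates CIDEr. Running it
# assumes pycocotools, pycocoevalcap, and a Java runtime for the PTB tokenizer
# are available; all field values below are hypothetical.
if __name__ == "__main__":
    from PIL import Image

    docs = [
        {
            "bbox": [10, 10, 60, 40],
            "image": Image.new("RGB", (100, 100), "white"),
            "instruction": "click the red button",
            "file_name": "example_0.png",
            "data_type": "text",
            "data_source": "web",
        },
        {
            "bbox": [5, 5, 30, 30],
            "image": Image.new("RGB", (100, 100), "white"),
            "instruction": "open the settings menu",
            "file_name": "example_1.png",
            "data_type": "icon",
            "data_source": "web",
        },
    ]
    preds = ["press the red button", "open settings"]

    per_doc = [screenspot_process_result(doc, [pred]) for doc, pred in zip(docs, preds)]
    cider_inputs = [d["screenspot_CIDEr"] for d in per_doc]
    print("prompt:", screenspot_doc_to_text(docs[0]))
    print("visual size:", screenspot_bbox_doc_to_visual(docs[0])[0].size)
    print("CIDEr:", screenspot_cider(cider_inputs))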