Commit e457cfb: Create ScreenSpot on clean branch
1 parent d8a3a99

7 files changed: +401 -0 lines changed
Diff for: lmms_eval/tasks/screenspot/_default_template_rec_yaml (+33)

@@ -0,0 +1,33 @@
dataset_path: rootsautomation/ScreenSpot
output_type: generate_until
doc_to_visual: !function utils_rec.screenspot_rec_doc_to_visual
doc_to_text: !function utils_rec.screenspot_rec_doc_to_text
doc_to_target: "bbox"
generation_kwargs:
  until:
    - "ASSISTANT:"
process_results: !function utils_rec.screenspot_rec_process_result
metric_list:
  - metric: screenspot_IoU
    aggregation: !function utils_rec.screenspot_rec_iou
    higher_is_better: true
  - metric: screenspot_ACC@0.1
    aggregation: !function utils_rec.screenspot_rec_acc01
    higher_is_better: true
  - metric: screenspot_ACC@0.3
    aggregation: !function utils_rec.screenspot_rec_acc03
    higher_is_better: true
  - metric: screenspot_ACC@0.5
    aggregation: !function utils_rec.screenspot_rec_acc05
    higher_is_better: true
  - metric: screenspot_ACC@0.7
    aggregation: !function utils_rec.screenspot_rec_acc07
    higher_is_better: true
  - metric: screenspot_ACC@0.9
    aggregation: !function utils_rec.screenspot_rec_acc09
    higher_is_better: true
  - metric: screenspot_Center_ACC
    aggregation: !function utils_rec.screenspot_rec_center_acc
    higher_is_better: true
metadata:
  version: '0.0'
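The template above references aggregation helpers from utils_rec.py, which is the seventh file in this commit (the remaining 215 of the 401 added lines) but whose diff is not rendered on this page. As a minimal sketch, assuming each processed result carries a parsed predicted box alongside the gold box, the IoU and threshold-accuracy aggregations might look like the following; every field name below is a guess, not taken from the hidden file:

    def compute_iou(pred_bbox, gold_bbox):
        # Boxes are [x1, y1, x2, y2]; IoU = intersection area / union area.
        x1 = max(pred_bbox[0], gold_bbox[0])
        y1 = max(pred_bbox[1], gold_bbox[1])
        x2 = min(pred_bbox[2], gold_bbox[2])
        y2 = min(pred_bbox[3], gold_bbox[3])
        inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
        union = (
            (pred_bbox[2] - pred_bbox[0]) * (pred_bbox[3] - pred_bbox[1])
            + (gold_bbox[2] - gold_bbox[0]) * (gold_bbox[3] - gold_bbox[1])
            - inter
        )
        return inter / union if union > 0 else 0.0

    def screenspot_rec_iou(results):
        # Mean IoU over all samples.
        return sum(compute_iou(r["pred_bbox"], r["bbox"]) for r in results) / max(len(results), 1)

    def screenspot_rec_acc05(results):
        # ACC@0.5: share of samples whose IoU clears the 0.5 threshold;
        # acc01/acc03/acc07/acc09 would differ only in that threshold.
        return sum(compute_iou(r["pred_bbox"], r["bbox"]) >= 0.5 for r in results) / max(len(results), 1)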
Diff for: lmms_eval/tasks/screenspot/_default_template_reg_yaml (+15)

@@ -0,0 +1,15 @@
dataset_path: rootsautomation/ScreenSpot
output_type: generate_until
doc_to_visual: !function utils.screenspot_bbox_doc_to_visual
doc_to_text: !function utils.screenspot_doc_to_text
doc_to_target: "instruction"
generation_kwargs:
  until:
    - "ASSISTANT:"
process_results: !function utils.screenspot_process_result
metric_list:
  - metric: screenspot_CIDEr
    aggregation: !function utils.screenspot_cider
    higher_is_better: true
metadata:
  version: '0.0'
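This REG task inverts the REC task above: the target box is drawn onto the screenshot and the model must generate the instruction, which is then scored with CIDEr. From screenspot_doc_to_text in utils.py below, a doc with bbox [100, 200, 300, 260] yields:

    doc = {"bbox": [100.0, 200.0, 300.0, 260.0]}
    prompt = f"Direct a user to interact with the highlighted region [{doc['bbox'][0]:.2f}, {doc['bbox'][1]:.2f}, {doc['bbox'][2]:.2f}, {doc['bbox'][3]:.2f}]."
    # -> "Direct a user to interact with the highlighted region [100.00, 200.00, 300.00, 260.00]."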

Diff for: lmms_eval/tasks/screenspot/_screenspot.yaml (+4)

@@ -0,0 +1,4 @@
group: screenspot
task:
  - screenspot_reg_test
  - screenspot_rec_test
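With this group file, selecting the group name should run both subtasks in one invocation, e.g. python -m lmms_eval --tasks screenspot ... (the CLI style is assumed from lmms-eval's lm-evaluation-harness lineage; the runner itself is not part of this commit).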

Diff for: lmms_eval/tasks/screenspot/screenspot_rec_test.yaml (+4)

@@ -0,0 +1,4 @@
group: screenspot_rec
task: screenspot_rec_test
include: _default_template_rec_yaml
test_split: test
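The include key pulls every field in from _default_template_rec_yaml above, so this file only has to name the task and group and pin the split; sibling task files can share one template this way (merge semantics assumed from lmms-eval's config loader, which is not shown in this commit).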

Diff for: lmms_eval/tasks/screenspot/screenspot_reg_test.yaml (+4)

@@ -0,0 +1,4 @@
group: screenspot_reg
task: screenspot_reg_test
include: _default_template_reg_yaml
test_split: test

Diff for: lmms_eval/tasks/screenspot/utils.py (+126)

@@ -0,0 +1,126 @@
import logging

from PIL import ImageDraw
from pycocoevalcap.eval import COCOEvalCap, Bleu, Meteor, Rouge, Cider, Spice
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocotools.coco import COCO

# COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"]  # , "SPICE"]
COCO_METRICS = ["CIDEr"]

eval_logger = logging.getLogger("lmms-eval")


def screenspot_bbox_doc_to_visual(doc):
    # Draw the target region on the screenshot in red so the model can refer to it.
    bbox = doc["bbox"]
    image = doc["image"].convert("RGB")
    draw = ImageDraw.Draw(image)
    bbox_xy = [bbox[0], bbox[1], bbox[2], bbox[3]]
    draw.rectangle(bbox_xy, outline="red", width=3)
    return [image.convert("RGB")]


def screenspot_process_result(doc, result):
    """
    Args:
        doc: an instance of the eval dataset
        result: [pred]
    Returns:
        a dictionary with key: metric name (in this case screenspot_CIDEr), value: metric value
    """
    pred = result[0] if len(result) > 0 else ""
    ann_id = doc["file_name"]
    data_dict = {"instruction": doc["instruction"], "pred": pred, "ann_id": ann_id, "data_type": doc["data_type"], "data_source": doc["data_source"]}
    return {f"screenspot_{metric}": data_dict for metric in COCO_METRICS}


def screenspot_doc_to_text(doc):
    return f"Direct a user to interact with the highlighted region [{doc['bbox'][0]:.2f}, {doc['bbox'][1]:.2f}, {doc['bbox'][2]:.2f}, {doc['bbox'][3]:.2f}]."


def screenspot_aggregation_result(results, metric):
    # scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"), (Spice(), "SPICE")]
    scorers = [(Cider(), "CIDEr")]
    scorers_dict = {s[1]: s for s in scorers}

    stored_results = []
    # For the COCO eval tools to successfully create an index, the dataset
    # needs at least two dicts: 'annotations' and 'images'.
    # 'annotations' exactly reproduces the original annotations;
    # 'images' only needs the image id, which is contained in the file name.
    dataset = {"annotations": [], "images": []}
    idx = 0
    ann_id = 0
    for result in results:
        stored_results.append({"image_id": idx, "caption": result["pred"]})
        # for s in result["answer"]:
        dataset["annotations"].append({"image_id": idx, "caption": result["instruction"], "id": ann_id})
        ann_id += 1

        dataset["images"].append({"id": idx})
        idx += 1

    coco = COCO()
    # Manually create the index here
    coco.dataset = dataset
    coco.createIndex()

    coco_result = coco.loadRes(stored_results)
    coco_eval = COCOEvalCap(coco, coco_result)

    imgIds = coco_eval.params["image_id"]
    gts = {}
    res = {}
    for imgId in imgIds:
        gts[imgId] = coco_eval.coco.imgToAnns[imgId]
        res[imgId] = coco_eval.cocoRes.imgToAnns[imgId]

    eval_logger.info("tokenization...")
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    eval_logger.info(f"Computing {metric} scores...")

    score, scores = scorers_dict[metric][0].compute_score(gts, res)
    # coco_eval.setEval(score, metric)

    # When the metric is one of the Bleu variants, score is a list
    if type(score) == list:
        n = int(metric.split("_")[-1])
        score = score[n - 1]

    return score


def screenspot_bleu4(results):
    return screenspot_aggregation_result(results, "Bleu_4")


def screenspot_bleu3(results):
    return screenspot_aggregation_result(results, "Bleu_3")


def screenspot_bleu2(results):
    return screenspot_aggregation_result(results, "Bleu_2")


def screenspot_bleu1(results):
    return screenspot_aggregation_result(results, "Bleu_1")


def screenspot_meteor(results):
    return screenspot_aggregation_result(results, "METEOR")


def screenspot_rougel(results):
    return screenspot_aggregation_result(results, "ROUGE_L")


def screenspot_cider(results):
    return screenspot_aggregation_result(results, "CIDEr")


def screenspot_spice(results):
    return screenspot_aggregation_result(results, "SPICE")
