# evaluation.py (forked from ntrang086/image_captioning)
from torchvision import transforms
import torch
import torch.nn as nn
import os
from tqdm import tqdm
from data_loader import get_loader
from model import EncoderCNN, DecoderRNN
from utils import clean_sentence
import json
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap
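
# The project-local helpers imported above are assumed to behave as follows:
# - DecoderRNN.sample_beam_search(features) returns candidate captions as lists
#   of word indices, with the best-scoring candidate first;
# - clean_sentence(word_indices, vocab) converts one index sequence into a
#   plain-text caption using the vocabulary.
# These assumptions reflect how the helpers are used in evaluate() below.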

def evaluate(loader, encoder, decoder, criterion, vocab):
    """Generate a caption for every image produced by the validation loader
    and return the predictions wrapped in a COCO result object.
    The criterion argument is kept for interface symmetry with training
    but is not used here."""
    # Switch to evaluation mode
    encoder.eval()
    decoder.eval()
    cocoRes = COCO()
    anns = []
    # Disable gradient calculation because we are in inference mode
    with torch.no_grad():
        for batch in tqdm(loader):
            images, img_ids = batch[0], batch[1]
            # Move to GPU if CUDA is available
            if torch.cuda.is_available():
                images = images.cuda()
            # Pass the images through the CNN encoder
            features = encoder(images).unsqueeze(1)
            # Decode each image's features into a caption using beam search
            for i in range(img_ids.size(0)):
                feature = features[i].unsqueeze(0)
                outputs = decoder.sample_beam_search(feature)
                sentence = clean_sentence(outputs[0], vocab)
                image_id = img_ids[i].item()
                # print('id: {}, cap: {}'.format(image_id, sentence))
                anns.append({'image_id': image_id, 'caption': sentence})
    # COCOEvalCap expects each result annotation to carry a unique 'id'
    for ann_id, ann in enumerate(anns):
        ann['id'] = ann_id
    cocoRes.dataset['annotations'] = anns
    cocoRes.createIndex()
    return cocoRes

transform_val = transforms.Compose([
    transforms.Resize(256),                      # smaller edge of image resized to 256
    transforms.CenterCrop(224),                  # get 224x224 crop from the center
    transforms.ToTensor(),                       # convert the PIL Image to a tensor
    transforms.Normalize((0.485, 0.456, 0.406),  # normalize image for pre-trained model
                         (0.229, 0.224, 0.225))])
batch_size = 128
vocab_threshold = 5
embed_size = 512 # dimensionality of image and word embeddings
hidden_size = 512 # number of features in hidden state of the RNN decoder
num_epochs = 10 # number of training epochs
annFile = "coco/annotations/captions_val2014.json"
coco = COCO(annFile)
# Build the validation data loader, reusing the vocabulary saved during training
loader = get_loader(transform=transform_val,
                    mode='val',
                    batch_size=batch_size,
                    vocab_threshold=vocab_threshold,
                    vocab_from_file=True)
vocab_size = len(loader.dataset.vocab)
print('vocabulary size: {}'.format(vocab_size))
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
if torch.cuda.is_available():
    decoder.cuda()
    encoder.cuda()
criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() else nn.CrossEntropyLoss()
if torch.cuda.is_available():
    map_location = torch.device('cuda')
else:
    map_location = torch.device('cpu')
checkpoint = torch.load(os.path.join('./models', 'best-model.pkl'), map_location=map_location)
# Load the pre-trained weights
encoder.load_state_dict(checkpoint['encoder'])
decoder.load_state_dict(checkpoint['decoder'])
# Create captions for validation images
cocoRes = evaluate(loader, encoder, decoder, criterion, loader.dataset.vocab)
cocoEval = COCOEvalCap(coco, cocoRes)
imgIds = set([ann['image_id'] for ann in cocoRes.dataset['annotations']])
cocoEval.params['image_id'] = imgIds
cocoEval.evaluate()
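
# Summary of the aggregate scores computed by COCOEvalCap; cocoEval.eval maps
# each metric name (e.g. Bleu_4, METEOR, ROUGE_L, CIDEr) to its value.
for metric, score in cocoEval.eval.items():
    print('{}: {:.3f}'.format(metric, score))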

def getCIDEr(ann):
    return ann['cider']

# Attach each image's CIDEr score to its generated caption and sort from worst to best
anns = cocoRes.dataset['annotations']
for ann in anns:
    ann['cider'] = cocoEval.imgToEval[ann['image_id']]['CIDEr']
anns.sort(key=getCIDEr)
result = {'eval': cocoEval.eval, 'annotations': anns}
with open('result.json', 'wt') as f:
    json.dump(result, f, indent=1)
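
# Optional sanity check: reload the file written above and show the captions
# with the lowest and highest CIDEr scores (annotations are sorted ascending).
with open('result.json', 'rt') as f:
    saved = json.load(f)
worst = saved['annotations'][0]
best = saved['annotations'][-1]
print('lowest CIDEr ({:.3f}): image {} -> {}'.format(worst['cider'], worst['image_id'], worst['caption']))
print('highest CIDEr ({:.3f}): image {} -> {}'.format(best['cider'], best['image_id'], best['caption']))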