Commit: yolo update1
xiongzihua committed Jun 30, 2018
1 parent 0e5776a commit 6615f52
Showing 416 changed files with 32,336 additions and 89 deletions.
25 changes: 19 additions & 6 deletions README.md
@@ -2,12 +2,19 @@

[中文](中文.md)

**This is a testing repository which cannot reproduce the results of the original [paper](https://arxiv.org/pdf/1506.02640.pdf).**
**This is a testing repository which cannot reproduce the results of the original [paper](https://arxiv.org/pdf/1506.02640.pdf); our performance on the voc07 test set is 0.44 mAP.**

**I will lead a discussion about this later; if you are interested, you are welcome to contact me. If you find any bug, please let me know.**

I wrote this code for learning purposes. In yoloLoss.py, only the forward pass is implemented; with the autograd mechanism, the backward pass is computed automatically.
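
A minimal sketch of this forward-only pattern (a toy loss, not the actual yoloLoss implementation; the shapes and the MSE term are placeholders):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyLoss(nn.Module):
    # Only forward() is defined; autograd records the graph during the
    # forward pass and derives the backward pass automatically.
    def forward(self, pred, target):
        return F.mse_loss(pred, target)

pred = torch.randn(2, 5, requires_grad=True)
loss = ToyLoss()(pred, torch.zeros(2, 5))
loss.backward()  # gradients computed without a hand-written backward()
print(pred.grad.shape)  # torch.Size([2, 5])
```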

### update

1. Changed vgg16 to vgg16_bn.
2. Included voc07_trainval, and note that when evaluating on the voc07 test set, difficult boxes should be excluded.
3. Enlarging the input image does not achieve the desired effect: 224x224 gets 0.43 mAP, while 448x448 gets 0.44. I changed conv1 to stride=2 to fit the 448 resolution (a sketch follows this list). A better network design may work better.
4. In yoloLoss, we want the confidence score to equal the IoU between the predicted box and the ground truth (see the second sketch below).
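
A hedged sketch of the conv1 stride change in update 3, using torchvision's vgg16_bn (an illustration, not the repository's exact code):

```python
import torch
import torchvision.models as models

model = models.vgg16_bn(pretrained=False)
first = model.features[0]  # Conv2d(3, 64, kernel_size=3, padding=1)
# Give the first conv stride 2, so a 448x448 input reaches the rest of
# the network with the same spatial footprint as a 224x224 input.
model.features[0] = torch.nn.Conv2d(
    first.in_channels, first.out_channels,
    kernel_size=first.kernel_size, stride=2, padding=first.padding)

x = torch.randn(1, 3, 448, 448)
print(model.features(x).shape)  # torch.Size([1, 512, 7, 7])
```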

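And for the confidence target in update 4, a small IoU helper in the same spirit (a simplified stand-in for what yoloLoss.py computes, with made-up boxes):

```python
import torch

def box_iou(box1, box2):
    # Boxes are (x1, y1, x2, y2) corner coordinates.
    lt = torch.max(box1[:, :2], box2[:, :2])  # intersection top-left
    rb = torch.min(box1[:, 2:], box2[:, 2:])  # intersection bottom-right
    wh = (rb - lt).clamp(min=0)
    inter = wh[:, 0] * wh[:, 1]
    area1 = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1])
    area2 = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])
    return inter / (area1 + area2 - inter)

pred_boxes = torch.tensor([[0., 0., 2., 2.]])
gt_boxes = torch.tensor([[1., 1., 3., 3.]])
# The confidence target is the IoU itself, not a fixed 1.
conf_target = box_iou(pred_boxes, gt_boxes).detach()
print(conf_target)  # tensor([0.1429])
```
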
### 1. Dependency
- pytorch 0.2.0_2
- opencv
@@ -18,7 +25,8 @@ I write this code for learning useage. In yoloLoss.py, i write forward only, wit

1. Download voc2012train dataset
2. Download voc2007test dataset
3. Convert the xml annotations to txt files. To use dataset.py, you should put xml_2_txt.py in the same folder as the voc dataset, or change the *Annotations* path in xml_2_txt.py
3. Put all images in one folder; I have provided the txt annotation files
~~3. Convert the xml annotations to txt files. To use dataset.py, you should put xml_2_txt.py in the same folder as the voc dataset, or change the *Annotations* path in xml_2_txt.py~~
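
For reference, each line of the provided txt annotation file follows the format described in dataset.py; the numbers here are made up (two boxes, each in (x1, y1, x2, y2, class) form):

```
image_name.jpg 48 240 195 371 11 8 12 352 498 14
```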

### 3. Train
Run `python train.py`
@@ -45,10 +53,15 @@ The origin paper use linear activation functiona for the final layer, it's outpu
Update: I did another experiment. I used linear activation, set the learning rate carefully as in the paper, and replaced sqrt(w), sqrt(h) with (w, h) to avoid the NaN problem. But the result was not good either.
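
The NaN problem, for context: with a linear output layer the predicted width or height can go negative, and the square root of a negative tensor is NaN, which then propagates into the gradients. A tiny illustration (an assumption about the failure mode, not code from this repository):

```python
import torch

w_pred = torch.tensor([-0.1, 0.25], requires_grad=True)
loss = torch.sqrt(w_pred).sum()  # sqrt(-0.1) -> nan, so the sum is nan
loss.backward()
print(loss, w_pred.grad)  # loss is nan and the gradient contains nan
```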

### 6. result
1. On the train dataset, mAP is about 0.5. Some result images are in trainIMGresult.

![](trainIMGresult/2007_000032.jpg)
Our mAP on the voc2007 test set is about 0.44; some results are shown below.

![](testimg/000339.jpg)

![](testimg/000356.jpg)

![](testimg/000447.jpg)

2. On the test dataset, mAP is about 0.2. Some result images are in testIMGresult. The test results are not good.
![](testimg/000510.jpg)

![](testIMGresult/000004.jpg)
![](testimg/000586.jpg)
Binary file added __pycache__/dataset.cpython-36.pyc
Binary file not shown.
Binary file added __pycache__/net.cpython-36.pyc
Binary file not shown.
Binary file added __pycache__/predict.cpython-36.pyc
Binary file not shown.
Binary file added __pycache__/visualize.cpython-36.pyc
Binary file not shown.
Binary file added __pycache__/yoloLoss.cpython-36.pyc
Binary file not shown.
25 changes: 16 additions & 9 deletions dataset.py
@@ -3,7 +3,7 @@
#created by xiongzihua
#
'''
txt annotation file: image_name.jpg num x1 y1 x2 y2 c x1 y1 x2 y2 c (a line like this means the image contains two objects)
txt annotation file: image_name.jpg x1 y1 x2 y2 c x1 y1 x2 y2 c (a line like this means the image contains two objects)
'''
import os
import sys
@@ -19,7 +19,7 @@
import cv2

class yoloDataset(data.Dataset):
image_size = 224
image_size = 448
def __init__(self,root,list_file,train,transform):
print('data init')
self.root=root
@@ -30,21 +30,28 @@ def __init__(self,root,list_file,train,transform):
self.labels = []
self.mean = (123,117,104)#RGB

if isinstance(list_file, list):
# Cat multiple list files together.
# This is especially useful for voc07/voc12 combination.
tmp_file = '/tmp/listfile.txt'
os.system('cat %s > %s' % (' '.join(list_file), tmp_file))
list_file = tmp_file

with open(list_file) as f:
lines = f.readlines()

for line in lines:
splited = line.strip().split()
self.fnames.append(splited[0])
num_faces = int(splited[1])
num_boxes = (len(splited) - 1) // 5
box=[]
label=[]
for i in range(num_faces):
x = float(splited[2+5*i])
y = float(splited[3+5*i])
x2 = float(splited[4+5*i])
y2 = float(splited[5+5*i])
c = splited[6+5*i]
for i in range(num_boxes):
x = float(splited[1+5*i])
y = float(splited[2+5*i])
x2 = float(splited[3+5*i])
y2 = float(splited[4+5*i])
c = splited[5+5*i]
box.append([x,y,x2,y2])
label.append(int(c)+1)
self.boxes.append(torch.Tensor(box))
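
A hedged usage sketch of the multi-list-file feature added above (the file names, root path, and transform value are assumptions based on the README, not verbatim from train.py):

```python
from torchvision import transforms
from dataset import yoloDataset

# Passing a list of annotation files makes the dataset cat them together,
# e.g. to combine the voc2012 train list with voc07 trainval.
train_dataset = yoloDataset(
    root='allimgs/',
    list_file=['voc2012.txt', 'voc07_trainval.txt'],
    train=True,
    transform=[transforms.ToTensor()])
print(len(train_dataset))
```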
78 changes: 45 additions & 33 deletions eval_voc.py
@@ -3,7 +3,7 @@
#created by xiongzihua
#
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import numpy as np
VOC_CLASSES = ( # always index 0
'aeroplane', 'bicycle', 'bird', 'boat',
@@ -137,70 +137,82 @@ def test_eval():
from predict import *
from collections import defaultdict
from tqdm import tqdm
from resnet import resnet18

target = defaultdict(list)
preds = defaultdict(list)
image_list = [] #image path list

f = open('voc2012.txt')
# f = open('voc2007test.txt')
f = open('voc07_test.txt')
lines = f.readlines()
file_list = []
for line in lines:
splited = line.strip().split()
file_list.append(splited)
f.close()

f_diff = open('voc07_test_difficult.txt')
lines = f_diff.readlines()
difficult_list = []
for line in lines:
splited = line.strip().split()
# print(splited)
difficult_list.append(splited)
f_diff.close()
print('---prepare target---')
for image_file in tqdm(file_list):
for index,image_file in enumerate(file_list):
image_diff = difficult_list[index]
image_id = image_file[0]
assert image_id == image_diff[0]

image_list.append(image_id)
num_obj = int(image_file[1])
num_obj = (len(image_file) - 1) // 5
for i in range(num_obj):
x1 = int(image_file[2+5*i])
y1 = int(image_file[3+5*i])
x2 = int(image_file[4+5*i])
y2 = int(image_file[5+5*i])
c = int(image_file[6+5*i])
difficult = image_diff[i+1]
x1 = int(image_file[1+5*i])
y1 = int(image_file[2+5*i])
x2 = int(image_file[3+5*i])
y2 = int(image_file[4+5*i])
c = int(image_file[5+5*i])
class_name = VOC_CLASSES[c]
target[(image_id,class_name)].append([x1,y1,x2,y2])
if difficult=='1':
continue
else:
target[(image_id,class_name)].append([x1,y1,x2,y2])
#
#start test
#
print('---start test---')
model = vgg16(pretrained=False)
model.classifier = nn.Sequential(
nn.Linear(512 * 7 * 7, 4096),
nn.ReLU(True),
nn.Dropout(),
#nn.Linear(4096, 4096),
#nn.ReLU(True),
#nn.Dropout(),
nn.Linear(4096, 1470),
)
'''model = resnet18(pretrained=False)
model.fc = nn.Linear(512,1470)'''
model = vgg16_bn(pretrained=False)
# model.classifier = nn.Sequential(
# nn.Linear(512 * 7 * 7, 4096),
# nn.ReLU(True),
# nn.Dropout(),
# #nn.Linear(4096, 4096),
# #nn.ReLU(True),
# #nn.Dropout(),
# nn.Linear(4096, 1470),
# )
model.load_state_dict(torch.load('best.pth'))
model.eval()
model.cuda()
count = 0
for image_path in tqdm(image_list):
result = predict_gpu(model,image_path,root_path='/home/xzh/codedata/voc2012train/JPEGImages/') #result[[left_up,right_bottom,class_name,image_path],]
result = predict_gpu(model,image_path,root_path='/home/xzh/data/VOCdevkit/VOC2012/allimgs/') #result[[left_up,right_bottom,class_name,image_path],]
for (x1,y1),(x2,y2),class_name,image_id,prob in result: #image_id is actually image_path
preds[class_name].append([image_id,prob,x1,y1,x2,y2])
'''image = cv2.imread('/home/xzh/codedata/voc2012train/JPEGImages/'+image_path)
# print(image_path)
image = cv2.imread('/home/xzh/data/VOCdevkit/VOC2012/allimgs/'+image_path)
for left_up,right_bottom,class_name,_,prob in result:
color = Color[VOC_CLASSES.index(class_name)]
cv2.rectangle(image,left_up,right_bottom,color,2)
cv2.putText(image,class_name,left_up,cv2.FONT_HERSHEY_SIMPLEX,1,color,1,cv2.LINE_AA)
cv2.putText(image,class_name+str(round(prob,2)),left_up,cv2.FONT_HERSHEY_SIMPLEX,0.6,(0,0,0),1,cv2.LINE_AA)
#print(prob)

cv2.imwrite('testimg/'+image_path,image)
cv2.imwrite('testimg/'+image_path,image)
count += 1
if count == 50:
break'''
if count == 300:
break

print('---start evaluate---')
voc_eval(preds,target,VOC_CLASSES=VOC_CLASSES)



voc_eval(preds,target,VOC_CLASSES=VOC_CLASSES)
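
For reference, the two containers built above follow a simple layout; a minimal sketch with made-up numbers (the voc_eval call itself is the one from this file):

```python
from collections import defaultdict

preds = defaultdict(list)    # class_name -> [image_id, prob, x1, y1, x2, y2]
target = defaultdict(list)   # (image_id, class_name) -> [x1, y1, x2, y2]

# One detection of a dog on image '000001' with confidence 0.9 ...
preds['dog'].append(['000001', 0.9, 48, 240, 195, 371])
# ... and its ground-truth box (difficult boxes never enter target at all).
target[('000001', 'dog')].append([48, 240, 195, 371])

# voc_eval(preds, target, VOC_CLASSES=VOC_CLASSES) then computes the per-class AP.
```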
2 changes: 1 addition & 1 deletion experimentIMG/yoloLoss.svg
File not shown.
120 changes: 120 additions & 0 deletions log.txt
@@ -0,0 +1,120 @@
0 6.5415048876116355
1 6.100706057394705
2 5.776972419984879
3 5.554384683793591
4 5.3748845561858145
5 5.226670012935515
6 5.129751951463761
7 5.028328594084709
8 4.927295440243136
9 4.900093263195407
10 4.782809845093758
11 4.747922060566564
12 4.7022608941601165
13 4.593218278884888
14 4.5595668500469575
15 4.5256145277330955
16 4.487990423940843
17 4.437278278412357
18 4.405658186635663
19 4.345585892277379
20 4.351668170190627
21 4.281654350219235
22 4.271116087513585
23 4.218694491540232
24 4.217058497090494
25 4.18591207842673
26 4.1442133134411225
27 4.136817063054731
28 4.138740881027714
29 4.141887569427491
30 4.0587586833584695
31 4.073217805739372
32 4.03728237613555
33 3.9956122844449937
34 3.9974620434545702
35 4.023998829626268
36 4.003252243226575
37 3.96282632120194
38 3.972557355511573
39 3.9570388470926594
40 3.9371616824980706
41 3.9387538525366015
42 3.9152787362375565
43 3.9202156805223036
44 3.9222787534036945
45 3.9003318632802655
46 3.8846307400734195
47 3.8660433077043104
48 3.8885516658906014
49 3.8739484986951274
50 3.8828188911561043
51 3.879196426945348
52 3.8579012778497512
53 3.8586445162373204
54 3.8485875283518145
55 3.9004847126622355
56 3.856565999984741
57 3.8792484175774358
58 3.8369168358464396
59 3.8326780919105774
60 3.8230616754101168
61 3.8407709090940414
62 3.863103226692446
63 3.8657945602170884
64 3.8421273939071163
65 3.7986517229387835
66 3.82103298864057
67 3.8075707174116564
68 3.822302801378312
69 3.8013139247894285
70 3.805841762788834
71 3.794689299983363
72 3.827355009509671
73 3.8235756274192565
74 3.8244812150155343
75 3.7931045101534937
76 3.8153305530548094
77 3.811681870491274
78 3.803573694536763
79 3.797684003460792
80 3.724775042072419
81 3.7086529916332616
82 3.69718047418902
83 3.7072122058560772
84 3.697758961492969
85 3.695369718151708
86 3.695834325205895
87 3.6924753996633712
88 3.6935014640131305
89 3.6894948267167615
90 3.684856927779413
91 3.6949271040578044
92 3.6909136433755196
93 3.684668857820572
94 3.6938365920897454
95 3.6776908213092434
96 3.68652562479819
97 3.6927355427895825
98 3.682749411367601
99 3.68042238989184
100 3.6780288465561406
101 3.6768690047725556
102 3.675544171948587
103 3.678370831089635
104 3.6778974602299352
105 3.67521142344321
106 3.6865312660894087
107 3.6777405446575533
108 3.680706839407644
109 3.6763052263567526
110 3.6846513532823133
111 3.6738818607022687
112 3.677423651756779
113 3.6801375196826074
114 3.67694855043965
115 3.684765911102295
116 3.6786774143095937
117 3.681097923555682
118 3.6744024461315523
119 3.679993252600393
