代码注释完毕

bobo0810 · Sep 10, 2018 · 26754d5 · 26754d5
1 parent 2be5f35
commit 26754d5
Show file tree

Hide file tree

Showing 5 changed files with 153 additions and 69 deletions.
diff --git a/README.md b/README.md
@@ -1,5 +1,31 @@
-# 注：关于本仓库，本人新增内容在最下面（未写）
+
 ## 该仓库为 原作者的内容，本人仅阅读并在代码中加入大量中文注释，以便理解。
+
+- ### 第一次更新：新增本仓库，代码未加完注释
+
+- ### 第二次更新： 整个训练过程、损失计算等加完注释，测试和验证代码量少、简单易懂且与训练代码大致相同，暂时不加。
+
+## 接下来工作：
+
+ - ## [重构代码](https://github.com/bobo0810/AnnotatedNetworkModelGit)，并加入visdom可视化等，敬请期待~
+
+## 参考文献：
+
+推荐配合阅读，效果更佳~
+
+- [从0到1实现YOLOv3（part one）](https://blog.csdn.net/qq_25737169/article/details/80530579)
+
+- [从0到1实现YOLO v3（part two）](https://blog.csdn.net/qq_25737169/article/details/80634360)
+
+- [yolo v3 译文](https://zhuanlan.zhihu.com/p/34945787)
+
+ ## 环境：
+
+ | python版本 | pytorch版本 |
+|------------|-------------|
+| 3.5        | 0.4.1       |
+
+
 # PyTorch-YOLOv3
 Minimal implementation of YOLOv3 in PyTorch.
 

diff --git a/models.py b/models.py
@@ -138,63 +138,86 @@ def __init__(self, anchors, num_classes, img_dim):
         self.num_classes = num_classes  #数据集类别，coco数据集共80类
         self.bbox_attrs = 5 + num_classes  #一个 网格需要预测的值个数
         self.img_dim = img_dim   # 输入训练图像的大小
-        self.ignore_thres = 0.5  #阈值
-        self.lambda_coord = 1  #计算损失时的lambda，一般默认为1
+        self.ignore_thres = 0.5  #  是否为物体的阈值（ 预测结果，即物体置信度小于该阈值，则认为该处没有预测到物体）
+        self.lambda_coord = 1  #计算损失时的lambda，一般默认为1（损失公式中，用于调节 分类  和 检测  的比重）
 
-        self.mse_loss = nn.MSELoss()   #均方误差 损失函数
-        self.bce_loss = nn.BCELoss()  #计算目标和输出之间的二进制交叉熵  损失函数
+        self.mse_loss = nn.MSELoss()   #均方误差 损失函数，计算 检测时的坐标损失
+        self.bce_loss = nn.BCELoss()  #计算目标和输出之间的二进制交叉熵  损失函数，计算  多类别的分类损失
 
     def forward(self, x, targets=None):
+        # yolo有3个检测层13x13,26x26,52x52，这里以 第一个检测层13x13为例
+        # x [16,255,13,13]  16:batch数    255：深度   13x13：feature map大小
         bs = x.size(0)
-        g_dim = x.size(2)
-        stride =  self.img_dim / g_dim
-        # Tensors for cuda support
+        g_dim = x.size(2)  # feature map大小
+        stride =  self.img_dim / g_dim   # feature相对于原图416的缩放倍数   32
+        # Tensors for cuda support   设置初始化tensor的默认类型
         FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
         LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
 
+        # [16,3,13,13,85]     contiguous返回一个内存连续的有相同数据的 tensor
         prediction = x.view(bs,  self.num_anchors, self.bbox_attrs, g_dim, g_dim).permute(0, 1, 3, 4, 2).contiguous()
 
-        # Get outputs
-        x = torch.sigmoid(prediction[..., 0])          # Center x
-        y = torch.sigmoid(prediction[..., 1])          # Center y
-        w = prediction[..., 2]                         # Width
-        h = prediction[..., 3]                         # Height
-        conf = torch.sigmoid(prediction[..., 4])       # Conf
-        pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred.
-
-        # Calculate offsets for each grid
-        grid_x = torch.linspace(0, g_dim-1, g_dim).repeat(g_dim,1).repeat(bs*self.num_anchors, 1, 1).view(x.shape).type(FloatTensor)
-        grid_y = torch.linspace(0, g_dim-1, g_dim).repeat(g_dim,1).t().repeat(bs*self.num_anchors, 1, 1).view(y.shape).type(FloatTensor)
-        scaled_anchors = [(a_w / stride, a_h / stride) for a_w, a_h in self.anchors]
-        anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0]))
-        anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1]))
-        anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, g_dim*g_dim).view(w.shape)
-        anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, g_dim*g_dim).view(h.shape)
-
-        # Add offset and scale with anchors
-        pred_boxes = FloatTensor(prediction[..., :4].shape)
+        # Get outputs    85中0-3 为预测的框偏移，4为 物体置信度（是否有物体）  5： 为多类别的分类概率
+        x = torch.sigmoid(prediction[..., 0])          # Center x  [16,3,13,13]
+        y = torch.sigmoid(prediction[..., 1])          # Center y  [16,3,13,13]
+        w = prediction[..., 2]                         # Width     [16,3,13,13]
+        h = prediction[..., 3]                         # Height    [16,3,13,13]
+        conf = torch.sigmoid(prediction[..., 4])       # Conf      [16,3,13,13]
+        pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred. [16,3,13,13,80]
+
+        # Calculate offsets for each grid 计算每个网格的偏移量
+        # torch.linspace返回 start 和 end 之间等间隔 steps 点的一维 Tensor
+        # repeat沿着指定的尺寸重复 tensor
+        # 过程：
+        #      torch.linspace(0, g_dim-1, g_dim)  ->  长度为13的一维tensor，内容为0-12
+        #      repeat(g_dim,1)                    ->  [13,13]的tensor 每行内容为0-12,共13行
+        #      repeat(bs*self.num_anchors, 1, 1)  ->  [48,13,13]的tensor   [13,13]内容不变，在扩展的一维上重复48次
+        #      view(x.shape)                      ->  resize成[16,3,13,13]的tensor
+        # grid_x、grid_y用于 定位 feature map的网格左上角坐标
+        grid_x = torch.linspace(0, g_dim-1, g_dim).repeat(g_dim,1).repeat(bs*self.num_anchors, 1, 1).view(x.shape).type(FloatTensor)    # [16.3.13.13]  每行内容为0-12,共13行
+        grid_y = torch.linspace(0, g_dim-1, g_dim).repeat(g_dim,1).t().repeat(bs*self.num_anchors, 1, 1).view(y.shape).type(FloatTensor)  # [16.3.13.13]  每列内容为0-12,共13列（因为使用转置T）
+        scaled_anchors = [(a_w / stride, a_h / stride) for a_w, a_h in self.anchors]  #将 原图尺度的锚框也缩放到统一尺度下
+        anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0]))  #[3,1]  3个锚框的w值
+        anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1]))  #[3,1]  3个锚框的h值
+        anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, g_dim*g_dim).view(w.shape) #[16,3,13,13]
+        anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, g_dim*g_dim).view(h.shape) #[16,3,13,13]
+
+        # Add offset and scale with anchors  给锚框添加偏移量和比例
+        pred_boxes = FloatTensor(prediction[..., :4].shape)  #新建一个tensor[16,3,13,13,4]
+        # pred_boxes为 在13x13的feature map尺度上的预测框
+        # x,y为预测值（网格内的坐标，经过sigmoid之后值为0-1之间） grid_x，grid_y定位网格左上角偏移坐标（值在0-12之间）
         pred_boxes[..., 0] = x.data + grid_x
         pred_boxes[..., 1] = y.data + grid_y
+        # w，h为 预测值，即相对于原锚框的偏移值    anchor_w，anchor_h为 网格对应的3个锚框
         pred_boxes[..., 2] = torch.exp(w.data) * anchor_w
         pred_boxes[..., 3] = torch.exp(h.data) * anchor_h
 
         # Training 训练阶段
         if targets is not None:
 
+            # 将损失函数转到GPU上，第一次见...
             if x.is_cuda:
                 self.mse_loss = self.mse_loss.cuda()
                 self.bce_loss = self.bce_loss.cuda()
-
-            nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes.cpu().data,
-                                                                            targets.cpu().data,
-                                                                            scaled_anchors,
-                                                                            self.num_anchors,
-                                                                            self.num_classes,
-                                                                            g_dim,
-                                                                            self.ignore_thres,
-                                                                            self.img_dim)
-
+            # nGT 统计一个batch中的真值框个数
+            # nCorrect 统计 一个batch预测出有物体的个数
+            # mask   [16,3,13,13]全0   在3个原始锚框与 真值框 iou最大的那个锚框  对应的预测框位置置为1 ，即  负责检测物体的位置为1
+            # conf_mask  [16,3,13,13]  初始化全1，之后的操作：负责预测物体的网格置为1，它周围网格置为0
+            # tx, ty [16,3,13,13] 初始化全为0，在有真值框的网格位置写入 真实的物体中心点坐标
+            # tw, th  [16,3,13,13] 初始化全为0，该值为 真值框的w、h 按照公式转化为 网络输出时对应的真值（该值对应于 网络输出的真值）
+            # tconf [16,3,13,13]   初始化全0，有真值框对应的网格位置为1  标明 物体中心点落在该网格中，该网格去负责预测物体
+            # tcls    #[16,3,13,13,80]  初始化全0，经过one-hot编码后  在真实类别处值为1
+            nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes.cpu().data,   #在13x13尺度上的预测框  [16,3,13,13,4]
+                                                                            targets.cpu().data,                  #坐标被归一化后的真值框filled_labels[16,50,5] 值在0-1之间
+                                                                            scaled_anchors,                      #缩放到13x13尺度下的3个锚框
+                                                                            self.num_anchors,                    #锚框个数3
+                                                                            self.num_classes,                    #数据集类别数  coco数据集80
+                                                                            g_dim,                               #feature map大小 13（相对于原图的缩放倍数为stride=32）
+                                                                            self.ignore_thres,                   # 阈值（用于判断  真值框 与 3个原始锚框的iou > 阈值）
+                                                                            self.img_dim)                        #网络输入图像的大小 416
+            #  conf[16,3,13,13] 为网络输出值，物体置信度（是否有物体）
             nProposals = int((conf > 0.25).sum().item())
+            # 召回率recall = 预测出有物体 / 真值框总数
             recall = float(nCorrect / nGT) if nGT else 1
 
             # Handle masks
@@ -210,12 +233,15 @@ def forward(self, x, targets=None):
             tconf = Variable(tconf.type(FloatTensor), requires_grad=False)
             tcls  = Variable(tcls.type(FloatTensor), requires_grad=False)
 
-            # Mask outputs to ignore non-existing objects
+            # Mask outputs to ignore non-existing objects  通过掩码来忽略 不存在物体
+            # mask 初始化全为0，只有  在3个原始锚框与 真值框 iou最大的那个锚框  对应的预测框位置置为1，即  负责检测物体的位置为1
             loss_x = self.lambda_coord * self.bce_loss(x * mask, tx * mask)
             loss_y = self.lambda_coord * self.bce_loss(y * mask, ty * mask)
-            loss_w = self.lambda_coord * self.mse_loss(w * mask, tw * mask) / 2
+            loss_w = self.lambda_coord * self.mse_loss(w * mask, tw * mask) / 2   # 为何 /2 ?
             loss_h = self.lambda_coord * self.mse_loss(h * mask, th * mask) / 2
+            # 有无物体损失  conf_mask  [16,3,13,13]  初始化全1，之后的操作：负责预测物体的网格置为1，它周围网格置为0
             loss_conf = self.bce_loss(conf * conf_mask, tconf * conf_mask)
+            # 多分类损失
             loss_cls = self.bce_loss(pred_cls * cls_mask, tcls * cls_mask)
             loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls
 
@@ -244,14 +270,18 @@ def __init__(self, config_path, img_size=416):
         # 解析list，返回 pytorch模型结构
         self.hyperparams, self.module_list = create_modules(self.module_defs)
         self.img_size = img_size
+        # 即训练网络过程中使用的图像总个数 （官方权重内seen值为32013312）
         self.seen = 0
+        # 保存模型时文件头写入的信息（5个数值，其余可不写）
         self.header_info = np.array([0, 0, 0, self.seen, 0])
         self.loss_names = ['x', 'y', 'w', 'h', 'conf', 'cls', 'recall']
 
     def forward(self, x, targets=None):
+        # True: 训练阶段    False:预测阶段
         is_training = targets is not None
         output = []
         self.losses = defaultdict(float)
+        # 保存每一层的网络输出值
         layer_outputs = []
         for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
             if module_def['type'] in ['convolutional', 'upsample']:
@@ -261,7 +291,7 @@ def forward(self, x, targets=None):
                 route 指 按照列来合并tensor,即扩展深度
                 
                 当属性只有一个值时，它会输出由该值索引的网络层的特征图。
-                在我们的示例中，它是−4，因此这个层将从Route层向后输出第4层的特征图。
+                在我们的示例中，它是−4，因此这个层将输出从Route层开始倒数第4层的特征图。
 
                 当图层有两个值时，它会返回由其值所索引的图层的连接特征图。 
                 在我们的例子中，它是−1,61，并且该图层将输出来自上一层（-1）和第61层的特征图，并沿深度的维度连接。
@@ -280,6 +310,7 @@ def forward(self, x, targets=None):
                 # Train phase: get loss
                 # 训练阶段：获得损失
                 if is_training:
+                    # x为总loss, *losses为各种loss
                     x, *losses = module[0](x, targets)
                     for name, loss in zip(self.loss_names, losses):
                         self.losses[name] += loss
@@ -291,6 +322,8 @@ def forward(self, x, targets=None):
             layer_outputs.append(x)
 
         self.losses['recall'] /= 3
+        # 训练阶段：返回总loss 用于梯度更新
+        # 预测阶段：返回  预测结果
         return sum(output) if is_training else torch.cat(output, 1)
 
 

diff --git a/train.py b/train.py
@@ -86,7 +86,7 @@
 for epoch in range(opt.epochs):
     # 每轮epoch
     for batch_i, (_, imgs, targets) in enumerate(dataloader):
-        # imgs :处理后的图像tensor[16,3,416,416]        targets:真值框filled_labels[16,50,5]
+        # imgs :处理后的图像tensor[16,3,416,416]        targets:坐标被归一化后的真值框filled_labels[16,50,5] 值在0-1之间
         imgs = Variable(imgs.type(Tensor))
         targets = Variable(targets.type(Tensor), requires_grad=False)
         # 优化器梯度清零
@@ -104,6 +104,7 @@
                                     model.losses['h'], model.losses['conf'], model.losses['cls'],
                                     loss.item(), model.losses['recall']))
 
+        # 统计 训练过程共使用多少张图片，用于 保存权重时写入 文件头中
         model.seen += imgs.size(0)
     # 每隔几个模型保存一次
     if epoch % opt.checkpoint_interval == 0:

diff --git a/utils/datasets.py b/utils/datasets.py
@@ -137,7 +137,7 @@ def __getitem__(self, index):
         if labels is not None:
             filled_labels[range(len(labels))[:self.max_objects]] = labels[:self.max_objects]
         filled_labels = torch.from_numpy(filled_labels)
-        # 返回 图像路径、处理后的图像tensor、真值框filled_labels[50,5]
+        # 返回 图像路径、处理后的图像tensor、坐标被归一化后的真值框filled_labels[50,5] 值在0-1之间
         return img_path, input_img, filled_labels
 
     def __len__(self):