From 26754d5f6de2c46b99579cf6daf83d89adb10983 Mon Sep 17 00:00:00 2001
From: bobo <1055271769@qq.com>
Date: Mon, 10 Sep 2018 21:40:10 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E6=B3=A8=E9=87=8A=E5=AE=8C?=
 =?UTF-8?q?=E6=AF=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md         |  28 +++++++++++-
 models.py         | 111 ++++++++++++++++++++++++++++++----------------
 train.py          |   3 +-
 utils/datasets.py |   2 +-
 utils/utils.py    |  78 +++++++++++++++++++++-----------
 5 files changed, 153 insertions(+), 69 deletions(-)

diff --git a/README.md b/README.md
index 3402a31..e9fb1bf 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,31 @@
-# 注：关于本仓库，本人新增内容在最下面（未写）
+
 ## 该仓库为 原作者的内容，本人仅阅读并在代码中加入大量中文注释，以便理解。
+
+- ### 第一次更新：新增本仓库，代码未加完注释
+
+- ### 第二次更新： 整个训练过程、损失计算等加完注释，测试和验证代码量少、简单易懂且与训练代码大致相同，暂时不加。
+
+## 接下来工作：
+
+ - ## [重构代码](https://github.com/bobo0810/AnnotatedNetworkModelGit)，并加入visdom可视化等，敬请期待~
+
+## 参考文献：
+
+推荐配合阅读，效果更佳~
+
+- [从0到1实现YOLOv3（part one）](https://blog.csdn.net/qq_25737169/article/details/80530579)
+
+- [从0到1实现YOLO v3（part two）](https://blog.csdn.net/qq_25737169/article/details/80634360)
+
+- [yolo v3 译文](https://zhuanlan.zhihu.com/p/34945787)
+
+ ## 环境：
+
+ | python版本 | pytorch版本 |
+|------------|-------------|
+| 3.5        | 0.4.1       |
+
+
 # PyTorch-YOLOv3
 Minimal implementation of YOLOv3 in PyTorch.
 
diff --git a/models.py b/models.py
index 741d963..3cbaadb 100644
--- a/models.py
+++ b/models.py
@@ -138,63 +138,86 @@ def __init__(self, anchors, num_classes, img_dim):
         self.num_classes = num_classes  #数据集类别，coco数据集共80类
         self.bbox_attrs = 5 + num_classes  #一个 网格需要预测的值个数
         self.img_dim = img_dim   # 输入训练图像的大小
-        self.ignore_thres = 0.5  #阈值
-        self.lambda_coord = 1  #计算损失时的lambda，一般默认为1
+        self.ignore_thres = 0.5  #  iou阈值（build_targets中，原始锚框与真值框的iou大于该阈值时，对应的conf_mask置为0，即不参与置信度损失计算）
+        self.lambda_coord = 1  #计算损失时的lambda，一般默认为1（损失公式中，坐标损失x、y、w、h的权重系数）
 
-        self.mse_loss = nn.MSELoss()   #均方误差 损失函数
-        self.bce_loss = nn.BCELoss()  #计算目标和输出之间的二进制交叉熵  损失函数
+        self.mse_loss = nn.MSELoss()   #均方误差 损失函数，用于计算 预测框宽高w、h的损失
+        self.bce_loss = nn.BCELoss()  #计算目标和输出之间的二进制交叉熵  损失函数，用于计算 中心点x、y、物体置信度 以及 多类别分类的损失
 
     def forward(self, x, targets=None):
+        # yolo有3个检测层13x13,26x26,52x52，这里以 第一个检测层13x13为例
+        # x [16,255,13,13]  16:batch数    255：深度   13x13：feature map大小
         bs = x.size(0)
-        g_dim = x.size(2)
-        stride =  self.img_dim / g_dim
-        # Tensors for cuda support
+        g_dim = x.size(2)  # feature map大小
+        stride =  self.img_dim / g_dim   # feature相对于原图416的缩放倍数   32
+        # Tensors for cuda support   设置初始化tensor的默认类型
         FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
         LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
 
+        # [16,3,13,13,85]     contiguous返回一个内存连续的有相同数据的 tensor
         prediction = x.view(bs,  self.num_anchors, self.bbox_attrs, g_dim, g_dim).permute(0, 1, 3, 4, 2).contiguous()
 
-        # Get outputs
-        x = torch.sigmoid(prediction[..., 0])          # Center x
-        y = torch.sigmoid(prediction[..., 1])          # Center y
-        w = prediction[..., 2]                         # Width
-        h = prediction[..., 3]                         # Height
-        conf = torch.sigmoid(prediction[..., 4])       # Conf
-        pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred.
-
-        # Calculate offsets for each grid
-        grid_x = torch.linspace(0, g_dim-1, g_dim).repeat(g_dim,1).repeat(bs*self.num_anchors, 1, 1).view(x.shape).type(FloatTensor)
-        grid_y = torch.linspace(0, g_dim-1, g_dim).repeat(g_dim,1).t().repeat(bs*self.num_anchors, 1, 1).view(y.shape).type(FloatTensor)
-        scaled_anchors = [(a_w / stride, a_h / stride) for a_w, a_h in self.anchors]
-        anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0]))
-        anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1]))
-        anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, g_dim*g_dim).view(w.shape)
-        anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, g_dim*g_dim).view(h.shape)
-
-        # Add offset and scale with anchors
-        pred_boxes = FloatTensor(prediction[..., :4].shape)
+        # Get outputs    85中0-3 为预测的框偏移，4为 物体置信度（是否有物体）  5： 为多类别的分类概率
+        x = torch.sigmoid(prediction[..., 0])          # Center x  [16,3,13,13]
+        y = torch.sigmoid(prediction[..., 1])          # Center y  [16,3,13,13]
+        w = prediction[..., 2]                         # Width     [16,3,13,13]
+        h = prediction[..., 3]                         # Height    [16,3,13,13]
+        conf = torch.sigmoid(prediction[..., 4])       # Conf      [16,3,13,13]
+        pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred. [16,3,13,13,80]
+
+        # Calculate offsets for each grid 计算每个网格的偏移量
+        # torch.linspace返回 start 和 end 之间等间隔 steps 点的一维 Tensor
+        # repeat沿着指定的尺寸重复 tensor
+        # 过程：
+        #      torch.linspace(0, g_dim-1, g_dim)  ->  [13]的一维tensor，内容为0-12
+        #      repeat(g_dim,1)                    ->  [13,13]的tensor 每行内容为0-12,共13行
+        #      repeat(bs*self.num_anchors, 1, 1)  ->  [48,13,13]的tensor   [13,13]内容不变，在扩展的一维上重复48次
+        #      view(x.shape)                      ->  resize成[16,3,13,13]的tensor
+        # grid_x、grid_y用于 定位 feature map的网格左上角坐标
+        grid_x = torch.linspace(0, g_dim-1, g_dim).repeat(g_dim,1).repeat(bs*self.num_anchors, 1, 1).view(x.shape).type(FloatTensor)    # [16.3.13.13]  每行内容为0-12,共13行
+        grid_y = torch.linspace(0, g_dim-1, g_dim).repeat(g_dim,1).t().repeat(bs*self.num_anchors, 1, 1).view(y.shape).type(FloatTensor)  # [16.3.13.13]  每列内容为0-12,共13列（因为使用转置T）
+        scaled_anchors = [(a_w / stride, a_h / stride) for a_w, a_h in self.anchors]  #将 原图尺度的锚框也缩放到统一尺度下
+        anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0]))  #[3,1]  3个锚框的w值
+        anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1]))  #[3,1]  3个锚框的h值
+        anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, g_dim*g_dim).view(w.shape) #[16,3,13,13]
+        anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, g_dim*g_dim).view(h.shape) #[16,3,13,13]
+
+        # Add offset and scale with anchors  给锚框添加偏移量和比例
+        pred_boxes = FloatTensor(prediction[..., :4].shape)  #新建一个tensor[16,3,13,13,4]
+        # pred_boxes为 在13x13的feature map尺度上的预测框
+        # x,y为预测值（网格内的坐标，经过sigmoid之后值为0-1之间） grid_x，grid_y定位网格左上角偏移坐标（值在0-12之间）
         pred_boxes[..., 0] = x.data + grid_x
         pred_boxes[..., 1] = y.data + grid_y
+        # w，h为 预测值，即相对于原锚框的偏移值    anchor_w，anchor_h为 网格对应的3个锚框
         pred_boxes[..., 2] = torch.exp(w.data) * anchor_w
         pred_boxes[..., 3] = torch.exp(h.data) * anchor_h
 
         # Training 训练阶段
         if targets is not None:
 
+            # 将损失函数转到GPU上，第一次见...
             if x.is_cuda:
                 self.mse_loss = self.mse_loss.cuda()
                 self.bce_loss = self.bce_loss.cuda()
-
-            nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes.cpu().data,
-                                                                            targets.cpu().data,
-                                                                            scaled_anchors,
-                                                                            self.num_anchors,
-                                                                            self.num_classes,
-                                                                            g_dim,
-                                                                            self.ignore_thres,
-                                                                            self.img_dim)
-
+            # nGT 统计一个batch中的真值框个数
+            # nCorrect 统计 一个batch预测出有物体的个数
+            # mask   [16,3,13,13]全0   在3个原始锚框与 真值框 iou最大的那个锚框  对应的预测框位置置为1 ，即  负责检测物体的位置为1
+            # conf_mask  [16,3,13,13]  初始化全1，之后的操作：与真值框iou大于阈值的锚框 在该图片上对应的所有网格置为0，负责预测物体的网格再置为1
+            # tx, ty [16,3,13,13] 初始化全为0，在有真值框的网格位置写入 真实的物体中心点坐标
+            # tw, th  [16,3,13,13] 初始化全为0，该值为 真值框的w、h 按照公式转化为 网络输出时对应的真值（该值对应于 网络输出的真值）
+            # tconf [16,3,13,13]   初始化全0，有真值框对应的网格位置为1  标明 物体中心点落在该网格中，该网格去负责预测物体
+            # tcls    #[16,3,13,13,80]  初始化全0，经过one-hot编码后  在真实类别处值为1
+            nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes.cpu().data,   #在13x13尺度上的预测框  [16,3,13,13,4]
+                                                                            targets.cpu().data,                  #坐标被归一化后的真值框filled_labels[16,50,5] 值在0-1之间
+                                                                            scaled_anchors,                      #缩放到13x13尺度下的3个锚框
+                                                                            self.num_anchors,                    #锚框个数3
+                                                                            self.num_classes,                    #数据集类别数  coco数据集80
+                                                                            g_dim,                               #feature map大小 13
+                                                                            self.ignore_thres,                   # 阈值（用于判断  真值框 与 3个原始锚框的iou > 阈值）
+                                                                            self.img_dim)                        #网络输入图像的大小 416
+            #  conf[16,3,13,13] 为网络输出值，物体置信度（是否有物体）
             nProposals = int((conf > 0.25).sum().item())
+            # 召回率recall = 预测出有物体 / 真值框总数
             recall = float(nCorrect / nGT) if nGT else 1
 
             # Handle masks
@@ -210,12 +233,15 @@ def forward(self, x, targets=None):
             tconf = Variable(tconf.type(FloatTensor), requires_grad=False)
             tcls  = Variable(tcls.type(FloatTensor), requires_grad=False)
 
-            # Mask outputs to ignore non-existing objects
+            # Mask outputs to ignore non-existing objects  通过掩码来忽略 不存在物体
+            # mask 初始化全为0，只有  在3个原始锚框与 真值框 iou最大的那个锚框  对应的预测框位置置为1，即  负责检测物体的位置为1
             loss_x = self.lambda_coord * self.bce_loss(x * mask, tx * mask)
             loss_y = self.lambda_coord * self.bce_loss(y * mask, ty * mask)
-            loss_w = self.lambda_coord * self.mse_loss(w * mask, tw * mask) / 2
+            loss_w = self.lambda_coord * self.mse_loss(w * mask, tw * mask) / 2   # 为何 /2 ?
             loss_h = self.lambda_coord * self.mse_loss(h * mask, th * mask) / 2
+            # 有无物体损失  conf_mask  [16,3,13,13]  初始化全1，之后的操作：与真值框iou大于阈值的锚框 在该图片上对应的所有网格置为0，负责预测物体的网格再置为1
             loss_conf = self.bce_loss(conf * conf_mask, tconf * conf_mask)
+            # 多分类损失
             loss_cls = self.bce_loss(pred_cls * cls_mask, tcls * cls_mask)
             loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls
 
@@ -244,14 +270,18 @@ def __init__(self, config_path, img_size=416):
         # 解析list，返回 pytorch模型结构
         self.hyperparams, self.module_list = create_modules(self.module_defs)
         self.img_size = img_size
+        # 即训练网络过程中使用的图像总个数 （官方权重内seen值为32013312）
         self.seen = 0
+        # 保存模型时文件头写入的信息（共5个数值，第4个为seen，其余可不写）
         self.header_info = np.array([0, 0, 0, self.seen, 0])
         self.loss_names = ['x', 'y', 'w', 'h', 'conf', 'cls', 'recall']
 
     def forward(self, x, targets=None):
+        # True: 训练阶段    False:预测阶段
         is_training = targets is not None
         output = []
         self.losses = defaultdict(float)
+        # 保存每一层的网络输出值
         layer_outputs = []
         for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
             if module_def['type'] in ['convolutional', 'upsample']:
@@ -261,7 +291,7 @@ def forward(self, x, targets=None):
                 route 指 按照列来合并tensor,即扩展深度
                 
                 当属性只有一个值时，它会输出由该值索引的网络层的特征图。
-                在我们的示例中，它是−4，因此这个层将从Route层向后输出第4层的特征图。
+                在我们的示例中，它是−4，因此这个层将输出从Route层往回数第4层的特征图。
 
                 当图层有两个值时，它会返回由其值所索引的图层的连接特征图。 
                 在我们的例子中，它是−1,61，并且该图层将输出来自上一层（-1）和第61层的特征图，并沿深度的维度连接。
@@ -280,6 +310,7 @@ def forward(self, x, targets=None):
                 # Train phase: get loss
                 # 训练阶段：获得损失
                 if is_training:
+                    # x为总loss, *losses为各种loss
                     x, *losses = module[0](x, targets)
                     for name, loss in zip(self.loss_names, losses):
                         self.losses[name] += loss
@@ -291,6 +322,8 @@ def forward(self, x, targets=None):
             layer_outputs.append(x)
 
         self.losses['recall'] /= 3
+        # 训练阶段：返回总loss 用于梯度更新
+        # 预测阶段：返回  预测结果
         return sum(output) if is_training else torch.cat(output, 1)
 
 
diff --git a/train.py b/train.py
index 0748988..081b8a3 100644
--- a/train.py
+++ b/train.py
@@ -86,7 +86,7 @@
 for epoch in range(opt.epochs):
     # 每轮epoch
     for batch_i, (_, imgs, targets) in enumerate(dataloader):
-        # imgs :处理后的图像tensor[16,3,416,416]        targets:真值框filled_labels[16,50,5]
+        # imgs :处理后的图像tensor[16,3,416,416]        targets:坐标被归一化后的真值框filled_labels[16,50,5] 值在0-1之间
         imgs = Variable(imgs.type(Tensor))
         targets = Variable(targets.type(Tensor), requires_grad=False)
         # 优化器梯度清零
@@ -104,6 +104,7 @@
                                     model.losses['h'], model.losses['conf'], model.losses['cls'],
                                     loss.item(), model.losses['recall']))
 
+        # 统计 训练过程共使用多少张图片，用于 保存权重时写入 文件头中
         model.seen += imgs.size(0)
     # 每隔几个模型保存一次
     if epoch % opt.checkpoint_interval == 0:
diff --git a/utils/datasets.py b/utils/datasets.py
index df03db6..ca63c0f 100644
--- a/utils/datasets.py
+++ b/utils/datasets.py
@@ -137,7 +137,7 @@ def __getitem__(self, index):
         if labels is not None:
             filled_labels[range(len(labels))[:self.max_objects]] = labels[:self.max_objects]
         filled_labels = torch.from_numpy(filled_labels)
-        # 返回 图像路径、处理后的图像tensor、真值框filled_labels[50,5]
+        # 返回 图像路径、处理后的图像tensor、坐标被归一化后的真值框filled_labels[50,5] 值在0-1之间
         return img_path, input_img, filled_labels
 
     def __len__(self):
diff --git a/utils/utils.py b/utils/utils.py
index 641290d..c7425a9 100644
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -143,70 +143,94 @@ def non_max_suppression(prediction, num_classes, conf_thres=0.5, nms_thres=0.4):
             max_detections = torch.cat(max_detections).data
             # Add max detections to outputs
             output[image_i] = max_detections if output[image_i] is None else torch.cat((output[image_i], max_detections))
-
     return output
 
 def build_targets(pred_boxes, target, anchors, num_anchors, num_classes, dim, ignore_thres, img_dim):
-    nB = target.size(0)
-    nA = num_anchors
-    nC = num_classes
-    dim = dim
-    mask        = torch.zeros(nB, nA, dim, dim)
-    conf_mask   = torch.ones(nB, nA, dim, dim)
-    tx          = torch.zeros(nB, nA, dim, dim)
-    ty          = torch.zeros(nB, nA, dim, dim)
-    tw          = torch.zeros(nB, nA, dim, dim)
-    th          = torch.zeros(nB, nA, dim, dim)
-    tconf       = torch.zeros(nB, nA, dim, dim)
-    tcls        = torch.zeros(nB, nA, dim, dim, num_classes)
-
-    nGT = 0
-    nCorrect = 0
+    nB = target.size(0)  #batch个数  16
+    nA = num_anchors     #锚框个数   3
+    nC = num_classes     #数据集类别数  80
+    dim = dim            #feature map大小 13
+
+    # 初始化参数
+    mask        = torch.zeros(nB, nA, dim, dim)     #[16,3,13,13]   全0
+    conf_mask   = torch.ones(nB, nA, dim, dim)      #[16,3,13,13]   全1
+    tx          = torch.zeros(nB, nA, dim, dim)     #[16,3,13,13]   全0
+    ty          = torch.zeros(nB, nA, dim, dim)     #[16,3,13,13]   全0
+    tw          = torch.zeros(nB, nA, dim, dim)     #[16,3,13,13]   全0
+    th          = torch.zeros(nB, nA, dim, dim)     #[16,3,13,13]   全0
+    tconf       = torch.zeros(nB, nA, dim, dim)     #[16,3,13,13]   全0
+    tcls        = torch.zeros(nB, nA, dim, dim, num_classes)    #[16,3,13,13,80]  全0
+
+    # 为了计算一个batch中的recall召回率
+    nGT = 0  # 统计 真值框个数 GT ground truth
+    nCorrect = 0  # 统计 预测出有物体的个数 （即 真值框 与 3个原始锚框与真值框iou最大的那个锚框对应的预测框  之间的iou > 0.5 为预测正确）
+
+    # 遍历每一张图片
     for b in range(nB):
+        #遍历一张图片的所有物体
         for t in range(target.shape[1]):
             if target[b, t].sum() == 0:
+                # 即代表遍历完所有物体，continue直接开始下一次for循环(译者：使用break直接结束for循环更好)
                 continue
             nGT += 1
             # Convert to position relative to box
+            # target真值框 坐标被归一化后[16,50,5] 值在0-1之间。故乘以 dim  将尺度转化为  13x13尺度下的真值框
             gx = target[b, t, 1] * dim
             gy = target[b, t, 2] * dim
             gw = target[b, t, 3] * dim
             gh = target[b, t, 4] * dim
-            # Get grid box indices
+            # Get grid box indices 向下取整，获取网格框索引，即左上角偏移坐标
             gi = int(gx)
             gj = int(gy)
-            # Get shape of gt box
+            # Get shape of gt box [1,4]
             gt_box = torch.FloatTensor(np.array([0, 0, gw, gh])).unsqueeze(0)
-            # Get shape of anchor box
+            # Get shape of anchor box [3,4]   前两列全为0  后两列为 三个anchor的w、h
             anchor_shapes = torch.FloatTensor(np.concatenate((np.zeros((len(anchors), 2)), np.array(anchors)), 1))
             # Calculate iou between gt and anchor shapes
+            # 计算 一个真值框 与  对应的3个原始锚框  之间的iou
             anch_ious = bbox_iou(gt_box, anchor_shapes)
-            # Where the overlap is larger than threshold set mask to zero (ignore)
+            # Where the overlap is larger than threshold set mask to zero (ignore)   当iou重叠率>阈值，则置为0
+            # conf_mask全为1 [16,3,13,13]  当一个真值框 与  一个原始锚框  之间的iou > 阈值时，该锚框在当前图片上的所有网格都置为0。
+            # 即 这些锚框对应的网格 先都不参与置信度损失计算，后面的代码会 将负责预测物体的网格再置为1。
             conf_mask[b, anch_ious > ignore_thres] = 0
-            # Find the best matching anchor box
+            # Find the best matching anchor box  找到 一个真值框 与  对应的3个原始锚框  之间的iou最大的  下标值
             best_n = np.argmax(anch_ious)
-            # Get ground truth box
+            # Get ground truth box [1,4]
             gt_box = torch.FloatTensor(np.array([gx, gy, gw, gh])).unsqueeze(0)
-            # Get the best prediction
+            # Get the best prediction  [1,4]
+            # pred_boxes:在13x13尺度上的预测框
+            # pred_box：取出  3个原始锚框与 真值框 iou最大的那个锚框  对应的预测框
             pred_box = pred_boxes[b, best_n, gj, gi].unsqueeze(0)
-            # Masks
+            # Masks   [16,3,13,13]   全0      在3个原始锚框与 真值框 iou最大的那个锚框  对应的预测框位，即 负责预测物体的网格置为1 （此时它周围网格为0，思想类似nms）
             mask[b, best_n, gj, gi] = 1
+            #  [16,3,13,13]   全1 然后将 iou大于阈值的锚框 在当前图片上的所有网格 都置为0 不参与训练 ，然后  将负责预测物体的网格再次置为1。
+            #  即总体思想为： 负责预测物体的网格 位置置为1，与真值框形状接近（iou大于阈值）但不负责预测的锚框位置置为0 被忽略。
             conf_mask[b, best_n, gj, gi] = 1
-            # Coordinates
+            # Coordinates 坐标     gi= gx的向下取整。  gx-gi、gy-gj 为 网格内的 物体中心点坐标（0-1之间）
+            # tx  ty初始化全为0，在有真值框的网格位置写入 真实的物体中心点坐标
             tx[b, best_n, gj, gi] = gx - gi
             ty[b, best_n, gj, gi] = gy - gj
             # Width and height
+            #  论文中 13x13尺度下真值框=原始锚框 x 以e为底的 预测值。故预测值= log(13x13尺度下真值框  / 原始锚框  +  1e-16 )
             tw[b, best_n, gj, gi] = math.log(gw/anchors[best_n][0] + 1e-16)
             th[b, best_n, gj, gi] = math.log(gh/anchors[best_n][1] + 1e-16)
             # One-hot encoding of label
             tcls[b, best_n, gj, gi, int(target[b, t, 0])] = 1
-            # Calculate iou between ground truth and best matching prediction
+            # Calculate iou between ground truth and best matching prediction 计算真值框 与   3个原始锚框与真值框iou最大的那个锚框对应的预测框    之间的iou
             iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False)
+            # [16,3,13,13]   全0，有真值框对应的网格位置为1  标明 物体中心点落在该网格中，该网格去负责预测物体
             tconf[b, best_n, gj, gi] = 1
 
             if iou > 0.5:
                 nCorrect += 1
-
+    # nGT 统计一个batch中的真值框个数
+    # nCorrect 统计 一个batch预测出有物体的个数
+    # mask   [16,3,13,13] 初始化全0   在3个原始锚框与 真值框 iou最大的那个锚框  对应的预测框位置置为1
+    # conf_mask  [16,3,13,13]  初始化全1，之后的操作：与真值框iou大于阈值的锚框 在该图片上对应的所有网格置为0，负责预测物体的网格再置为1
+    # tx, ty [16,3,13,13] 初始化全为0，在有真值框的网格位置写入 真实的物体中心点坐标
+    # tw, th  [16,3,13,13] 初始化全为0，该值为 真值框的w、h 按照公式转化为 网络输出时对应的真值（该值对应于 网络输出的真值）
+    # tconf [16,3,13,13]   初始化全0，有真值框对应的网格位置为1  标明 物体中心点落在该网格中，该网格去负责预测物体
+    # tcls    #[16,3,13,13,80]  初始化全0，经过one-hot编码后  在真实类别处值为1
     return nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf, tcls
 
 def to_categorical(y, num_classes):