YOLOv3--Darknet53 in Practice

Published: 2023-08-01 15:47:36  Author: Frommoon

YOLOv3 drops pooling and fully-connected layers entirely; its backbone is built from 53 convolutional layers, hence the name Darknet-53. It predicts at multiple scales, with three anchor boxes per scale, and fuses feature maps across scales before predicting (the map with the larger receptive field is upsampled and merged with the map whose receptive field is smaller). The model is trained on the COCO dataset and returns each object's location and class (regression plus classification).

1. Data Preprocessing

  • Data is read in batch by batch during training; preprocessing happens when a batch is fed to the model, not all at once up front.
  • First, load the actual image and convert it to a tensor; next, pad the incoming image to a square based on its size. At the same time, read the image's label file (the label data are the box positions), convert the labels to tensors, and shift the coordinates to account for the padding. Finally, convert x1y1x2y2 (top-left and bottom-right corners) to xywh (center point plus width and height) and apply image augmentation.
    def __getitem__(self, index):  # load one image and its labels
        # ---------
        #  Image
        # ---------
        img_path = self.img_files[index % len(self.img_files)].rstrip()
        img_path = 'C:\\Users\\lus\\AA-project\\pytorch\\PyTorch-YOLOv3\\data\\coco' + img_path  # path to the actual image data
        #print (img_path)
        # Extract image as PyTorch tensor
        img = transforms.ToTensor()(Image.open(img_path).convert('RGB'))  # convert the image to a tensor

        # Handle images with fewer than three channels
        if len(img.shape) != 3:
            img = img.unsqueeze(0)
            img = img.expand((3, *img.shape[1:]))  # replicate the single channel three times

        _, h, w = img.shape  # raw input size, e.g. _: 3, h: 375, w: 500
        h_factor, w_factor = (h, w) if self.normalized_labels else (1, 1)
        # Pad the rectangular input to square resolution
        img, pad = pad_to_square(img, 0)
        _, padded_h, padded_w = img.shape  # e.g. padded_h: 500, padded_w: 500
        # ---------
        #  Label
        # ---------
        # Process the labels; they correspond to this image
        label_path = self.label_files[index % len(self.img_files)].rstrip()
        label_path = 'C:\\Users\\lus\\AA-project\\pytorch\\PyTorch-YOLOv3\\data\\coco\\labels' + label_path
        #print (label_path)

        targets = None
        if os.path.exists(label_path):  # the label data are the box positions
            boxes = torch.from_numpy(np.loadtxt(label_path).reshape(-1, 5))  # to tensor; column 0 is the class index, the rest are x, y, w, h
            # Extract corner coordinates for the unpadded, unscaled image
            x1 = w_factor * (boxes[:, 1] - boxes[:, 3] / 2)  # coordinates before padding
            y1 = h_factor * (boxes[:, 2] - boxes[:, 4] / 2)
            x2 = w_factor * (boxes[:, 1] + boxes[:, 3] / 2)
            y2 = h_factor * (boxes[:, 2] + boxes[:, 4] / 2)
            # Adjust for the added padding
            x1 += pad[0]  # coordinates after padding
            y1 += pad[2]
            x2 += pad[1]
            y2 += pad[3]
            # Convert x1y1x2y2 back to normalized (x, y, w, h)
            boxes[:, 1] = ((x1 + x2) / 2) / padded_w  # center x
            boxes[:, 2] = ((y1 + y2) / 2) / padded_h  # center y
            boxes[:, 3] *= w_factor / padded_w  # w
            boxes[:, 4] *= h_factor / padded_h  # h

            targets = torch.zeros((len(boxes), 6))
            targets[:, 1:] = boxes

        # Apply augmentations
        if self.augment:
            if np.random.random() < 0.5:
                img, targets = horisontal_flip(img, targets)

        return img_path, img, targets  # return the image path, image data, and labels
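
  • The helpers used above (pad_to_square, horisontal_flip) live in the repo's utils; a minimal sketch consistent with how they are called here (the exact repo code may differ slightly):

    import torch
    import torch.nn.functional as F

    def pad_to_square(img, pad_value):
        # Pad the shorter side so the image becomes square. Returns the padded
        # image plus the pad tuple (left, right, top, bottom), which __getitem__
        # uses above to shift the box coordinates.
        c, h, w = img.shape
        dim_diff = abs(h - w)
        pad1, pad2 = dim_diff // 2, dim_diff - dim_diff // 2
        pad = (0, 0, pad1, pad2) if h <= w else (pad1, pad2, 0, 0)
        img = F.pad(img, pad, "constant", value=pad_value)
        return img, pad

    def horisontal_flip(images, targets):
        # Mirror the image left-right; box centers are normalized, so x -> 1 - x
        images = torch.flip(images, [-1])
        targets[:, 2] = 1 - targets[:, 2]
        return images, targets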

2. Building the Network Structure

  • Create the model: define each layer's parameters, layer by layer, in the order given by the config file.
def create_modules(module_defs):
    """
    Constructs module list of layer blocks from module configuration in module_defs
    """
    hyperparams = module_defs.pop(0)  # the first block of the config file holds the hyperparameters
    output_filters = [int(hyperparams["channels"])]
    module_list = nn.ModuleList()  # modules are appended in order
    for module_i, module_def in enumerate(module_defs):  # iterate over each block of the config (a block is, e.g., conv + BN + LeakyReLU)
        modules = nn.Sequential()

        if module_def["type"] == "convolutional":  # dispatch on the block type
            bn = int(module_def["batch_normalize"])  # whether to add batch norm
            filters = int(module_def["filters"])
            kernel_size = int(module_def["size"])
            pad = (kernel_size - 1) // 2
            modules.add_module(
                f"conv_{module_i}",
                nn.Conv2d(
                    in_channels=output_filters[-1],  # input channels = previous block's output feature maps
                    out_channels=filters,
                    kernel_size=kernel_size,
                    stride=int(module_def["stride"]),
                    padding=pad,
                    bias=not bn,  # with BN, the conv bias is redundant
                ),
            )
            if bn:  # append batch normalization
                modules.add_module(f"batch_norm_{module_i}", nn.BatchNorm2d(filters, momentum=0.9, eps=1e-5))
            if module_def["activation"] == "leaky":  # append the activation
                modules.add_module(f"leaky_{module_i}", nn.LeakyReLU(0.1))

        elif module_def["type"] == "maxpool":
            kernel_size = int(module_def["size"])
            stride = int(module_def["stride"])
            if kernel_size == 2 and stride == 1:
                modules.add_module(f"_debug_padding_{module_i}", nn.ZeroPad2d((0, 1, 0, 1)))
            maxpool = nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=int((kernel_size - 1) // 2))
            modules.add_module(f"maxpool_{module_i}", maxpool)

        elif module_def["type"] == "upsample":
            upsample = Upsample(scale_factor=int(module_def["stride"]), mode="nearest")  # define the upsampling layer
            modules.add_module(f"upsample_{module_i}", upsample)
        # The route layer concatenates feature maps along the channel dimension: the map with the larger receptive field is upsampled and joined with the one whose receptive field is smaller (the deeper layer comes back to guide the shallower one)
        elif module_def["type"] == "route":  # e.g. input 1: 26*26*256, input 2: 26*26*128, output: 26*26*(256+128)
            layers = [int(x) for x in module_def["layers"].split(",")]
            filters = sum([output_filters[1:][i] for i in layers])
            modules.add_module(f"route_{module_i}", EmptyLayer())#创建空的层,先占位置,前向传播再用
        #shortcut层是数值的相加,配置文件中写的与上面第三层做残差连接,如果中间学的不好,直接上面的第三次与现在的数值相加
        elif module_def["type"] == "shortcut":
            filters = output_filters[1:][int(module_def["from"])]
            modules.add_module(f"shortcut_{module_i}", EmptyLayer())#创建空的层,先占位置,前向传播再用
        elif module_def["type"] == "yolo":
            anchor_idxs = [int(x) for x in module_def["mask"].split(",")]  # ids of the anchors used at this scale
            # Extract the actual anchor sizes
            anchors = [int(x) for x in module_def["anchors"].split(",")]
            anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]
            anchors = [anchors[i] for i in anchor_idxs]
            num_classes = int(module_def["classes"])  # 80 classes
            img_size = int(hyperparams["height"])
            # Define detection layer
            yolo_layer = YOLOLayer(anchors, num_classes, img_size)
            modules.add_module(f"yolo_{module_i}", yolo_layer)
        # Register module list and number of output filters
        module_list.append(modules)  # append the whole block (conv + BN + activation) to module_list
        output_filters.append(filters)  # record this block's number of output feature maps
    return hyperparams, module_list
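
  • The Upsample and EmptyLayer modules referenced above are small placeholders; roughly how the repo defines them (treat details as a sketch):

    import torch.nn as nn
    import torch.nn.functional as F

    class Upsample(nn.Module):
        # nn.Upsample is deprecated, so a thin wrapper around F.interpolate is used instead
        def __init__(self, scale_factor, mode="nearest"):
            super(Upsample, self).__init__()
            self.scale_factor = scale_factor
            self.mode = mode

        def forward(self, x):
            return F.interpolate(x, scale_factor=self.scale_factor, mode=self.mode)

    class EmptyLayer(nn.Module):
        # Placeholder for route/shortcut blocks; the real work happens in Darknet.forward
        def __init__(self):
            super(EmptyLayer, self).__init__()
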
  • How the data actually flows once it arrives:
class Darknet(nn.Module):
    """YOLOv3 object detection model"""
    # Constructor: declare up front every module the network uses
    def __init__(self, config_path, img_size=416):
        super(Darknet, self).__init__()
        self.module_defs = parse_model_config(config_path)  # parse the hand-written config file
        self.hyperparams, self.module_list = create_modules(self.module_defs)  # build the model: each layer's parameters defined in config order
        self.yolo_layers = [layer[0] for layer in self.module_list if hasattr(layer[0], "metrics")]
        self.img_size = img_size
        self.seen = 0
        self.header_info = np.array([0, 0, 0, self.seen, 0], dtype=np.int32)
    # How the data actually flows once it arrives
    def forward(self, x, targets=None):
        img_dim = x.shape[2]
        loss = 0
        layer_outputs, yolo_outputs = [], []  # per-layer outputs; outputs of the yolo layers
        for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
            if module_def["type"] in ["convolutional", "upsample", "maxpool"]:
                x = module(x)  # built-in PyTorch modules, called directly
            elif module_def["type"] == "route":
                x = torch.cat([layer_outputs[int(layer_i)] for layer_i in module_def["layers"].split(",")], 1)  # channel-wise concatenation
            elif module_def["type"] == "shortcut":
                layer_i = int(module_def["from"])  # e.g. layer_i = -3 means add with the layer three back
                x = layer_outputs[-1] + layer_outputs[layer_i]  # element-wise addition
            elif module_def["type"] == "yolo":
                x, layer_loss = module[0](x, targets, img_dim)  # x: previous layer's output; targets: labels (class + xywh); img_dim: input image size
                loss += layer_loss
                yolo_outputs.append(x)
            layer_outputs.append(x)  # record this layer's output
        yolo_outputs = to_cpu(torch.cat(yolo_outputs, 1))
        return yolo_outputs if targets is None else (loss, yolo_outputs)
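
  • For reference, the config file that drives create_modules uses the standard Darknet yolov3.cfg block format; an abridged excerpt (not the full file):

    [convolutional]
    batch_normalize=1
    filters=32
    size=3
    stride=1
    pad=1
    activation=leaky

    # residual add with the output three layers back
    [shortcut]
    from=-3
    activation=linear

    # concatenate the previous layer's output with layer 61's
    [route]
    layers=-1, 61

    [yolo]
    mask = 6,7,8
    anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
    classes=80
    num=9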

3. Forward Pass

(1) The YOLO layer

  • Turning relative (per-cell) positions into absolute positions:
    def compute_grid_offsets(self, grid_size, cuda=True):
        self.grid_size = grid_size
        g = self.grid_size
        FloatTensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor
        self.stride = self.img_dim / self.grid_size
        # Calculate offsets for each grid
        self.grid_x = torch.arange(g).repeat(g, 1).view([1, 1, g, g]).type(FloatTensor)
        self.grid_y = torch.arange(g).repeat(g, 1).t().view([1, 1, g, g]).type(FloatTensor)
        self.scaled_anchors = FloatTensor([(a_w / self.stride, a_h / self.stride) for a_w, a_h in self.anchors])
        self.anchor_w = self.scaled_anchors[:, 0:1].view((1, self.num_anchors, 1, 1))
        self.anchor_h = self.scaled_anchors[:, 1:2].view((1, self.num_anchors, 1, 1))
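
  • A quick sanity check of what those offsets look like (illustrative values; a 4x4 grid instead of 13x13):

    import torch

    g = 4  # tiny grid for illustration; the real scales use 13, 26, 52
    grid_x = torch.arange(g).repeat(g, 1).view(1, 1, g, g)
    grid_y = torch.arange(g).repeat(g, 1).t().view(1, 1, g, g)
    print(grid_x[0, 0])
    # tensor([[0, 1, 2, 3],
    #         [0, 1, 2, 3],
    #         [0, 1, 2, 3],
    #         [0, 1, 2, 3]])
    # grid_y is the transpose; adding these to the sigmoided x, y predictions
    # turns per-cell offsets into absolute grid coordinates. With img_dim = 416
    # and grid_size = 13, stride = 416 / 13 = 32, so an anchor of (116, 90)
    # pixels becomes (3.625, 2.8125) in grid units.
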
  • The YOLO layer's forward pass:
    def forward(self, x, targets=None, img_dim=None):
        # Tensors for cuda support
        print (x.shape)  # e.g. [4, 255, 10, 10]: batch, 3 * (5 + 80) channels, feature-map h, w
        FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
        LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
        ByteTensor = torch.cuda.ByteTensor if x.is_cuda else torch.ByteTensor

        self.img_dim = img_dim  # the input size is not fixed
        num_samples = x.size(0)  # batch size, e.g. 4 images per step
        grid_size = x.size(2)  # grid size

        prediction = (
            x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)  # num_anchors: 3 anchor shapes per grid cell; num_classes: 80
            .permute(0, 1, 3, 4, 2)
            .contiguous()
        )
        print (prediction.shape)#[4, 3, 10, 10, 85]
        # Get outputs
        x = torch.sigmoid(prediction[..., 0])  # Center x, relative position within the cell
        y = torch.sigmoid(prediction[..., 1])  # Center y
        w = prediction[..., 2]  # Width
        h = prediction[..., 3]  # Height
        pred_conf = torch.sigmoid(prediction[..., 4])  # Conf
        pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred: per-class probabilities in [0, 1]

        # If grid size does not match current we compute new offsets
        if grid_size != self.grid_size:
            self.compute_grid_offsets(grid_size, cuda=x.is_cuda)  # turn relative positions into absolute ones, e.g. (0.5, 0.5) inside a cell becomes (11.5, 11.5) on the grid

        # Add offsets and scale by the anchors: actual positions on the feature map
        pred_boxes = FloatTensor(prediction[..., :4].shape)
        pred_boxes[..., 0] = x.data + self.grid_x
        pred_boxes[..., 1] = y.data + self.grid_y
        pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w
        pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h
        output = torch.cat( 
            (
                pred_boxes.view(num_samples, -1, 4) * self.stride,  # map back to the input image scale
                pred_conf.view(num_samples, -1, 1),
                pred_cls.view(num_samples, -1, self.num_classes),
            ),
            -1,
        )
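
  • To make the decoding concrete, here is one cell worked through with made-up prediction values (everything below is illustrative, not from a real run):

    import math

    sigmoid = lambda v: 1 / (1 + math.exp(-v))

    tx, ty, tw, th = 0.2, 0.6, 0.1, -0.3   # hypothetical raw outputs for one anchor
    grid_x, grid_y = 5.0, 6.0              # the cell's indices on a 13x13 grid
    anchor_w, anchor_h = 3.625, 2.8125     # anchor (116, 90) divided by stride 32
    stride = 416 / 13                      # 32 pixels per grid cell

    bx = (sigmoid(tx) + grid_x) * stride   # ~177.6 px: center x in the 416x416 image
    by = (sigmoid(ty) + grid_y) * stride   # ~212.7 px: center y
    bw = anchor_w * math.exp(tw) * stride  # ~128.2 px: box width
    bh = anchor_h * math.exp(th) * stride  # ~66.7 px: box height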

4. Computing the Loss

  • The loss has four parts: localization error, confidence error where an object is present, confidence error where no object is present, and classification error.
def build_targets(pred_boxes, pred_cls, target, anchors, ignore_thres):
    ByteTensor = torch.cuda.ByteTensor if pred_boxes.is_cuda else torch.ByteTensor
    FloatTensor = torch.cuda.FloatTensor if pred_boxes.is_cuda else torch.FloatTensor

    nB = pred_boxes.size(0)  # batch size, e.g. 4
    nA = pred_boxes.size(1)  # number of anchors per grid cell
    nC = pred_cls.size(-1)   # number of classes, 80
    nG = pred_boxes.size(2)  # grid size

    # Output tensors; the loss covers: localization, object confidence, no-object confidence, classification
    obj_mask = ByteTensor(nB, nA, nG, nG).fill_(0)    # 1 where an anchor contains an object (default 0): the foreground mask
    noobj_mask = ByteTensor(nB, nA, nG, nG).fill_(1)  # 1 where an anchor contains no object (default 1): the background mask
    class_mask = FloatTensor(nB, nA, nG, nG).fill_(0) # class mask: 1 where the class is predicted correctly (default 0)
    iou_scores = FloatTensor(nB, nA, nG, nG).fill_(0) # IoU between predicted and ground-truth boxes
    tx = FloatTensor(nB, nA, nG, nG).fill_(0)         # ground-truth position relative to its grid cell
    ty = FloatTensor(nB, nA, nG, nG).fill_(0)
    tw = FloatTensor(nB, nA, nG, nG).fill_(0)
    th = FloatTensor(nB, nA, nG, nG).fill_(0)
    tcls = FloatTensor(nB, nA, nG, nG, nC).fill_(0)   # class targets

    # Convert targets to positions on the feature map
    target_boxes = target[:, 2:6] * nG  # target holds normalized (0-1) image coordinates; multiplying by nG gives xywh on the current grid, which is what the loss is computed against
    gxy = target_boxes[:, :2]
    gwh = target_boxes[:, 2:]
    # Get anchors with best iou
    ious = torch.stack([bbox_wh_iou(anchor, gwh) for anchor in anchors])  # IoU of every anchor shape against every ground-truth box
    print (ious.shape)  # e.g. [3, 18]: 18 ground-truth boxes, each scored against the 3 anchor shapes
    best_ious, best_n = ious.max(0)  # best score per box, and which anchor shape (best_n in {0, 1, 2}) fits it best
    # Separate target values
    b, target_labels = target[:, :2].long().t()  # b: which image in the batch each ground-truth box belongs to; target_labels: its actual class
    gx, gy = gxy.t()
    gw, gh = gwh.t()
    gi, gj = gxy.long().t()  # cell indices (positions floored down)
    # Set masks
    obj_mask[b, best_n, gj, gi] = 1   # mark cells that actually contain an object
    noobj_mask[b, best_n, gj, gi] = 0 # and clear them in the background mask

    # Set noobj mask to zero where iou exceeds ignore threshold
    for i, anchor_ious in enumerate(ious.t()):  # anchors whose IoU exceeds the threshold are not treated as background
        noobj_mask[b[i], anchor_ious > ignore_thres, gj[i], gi[i]] = 0

    # Coordinates: tx, ty are the ground-truth center's offset inside its cell
    tx[b, best_n, gj, gi] = gx - gx.floor()  # position of the ground-truth box relative to its grid cell
    ty[b, best_n, gj, gi] = gy - gy.floor()
    # Width and height
    tw[b, best_n, gj, gi] = torch.log(gw / anchors[best_n][:, 0] + 1e-16)
    th[b, best_n, gj, gi] = torch.log(gh / anchors[best_n][:, 1] + 1e-16)
    # One-hot encoding of label
    tcls[b, best_n, gj, gi, target_labels] = 1  # one-hot encode the ground-truth label
    # Compute label correctness and IoU at the best anchor
    class_mask[b, best_n, gj, gi] = (pred_cls[b, best_n, gj, gi].argmax(-1) == target_labels).float()
    iou_scores[b, best_n, gj, gi] = bbox_iou(pred_boxes[b, best_n, gj, gi], target_boxes, x1y1x2y2=False)  # IoU between each matched predicted box and its ground-truth box

    tconf = obj_mask.float()  # target confidence: 1 wherever there is an object
    return iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf
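
  • bbox_wh_iou above compares anchor shapes to ground-truth boxes by width and height alone (as if both were centered at the same point); a minimal sketch matching its usage here (bbox_iou is the usual corner-aware IoU):

    import torch

    def bbox_wh_iou(wh1, wh2):
        # wh1: one anchor (w, h); wh2: [N, 2] ground-truth widths and heights
        wh2 = wh2.t()
        w1, h1 = wh1[0], wh1[1]
        w2, h2 = wh2[0], wh2[1]
        inter_area = torch.min(w1, w2) * torch.min(h1, h2)
        union_area = (w1 * h1 + 1e-16) + w2 * h2 - inter_area
        return inter_area / union_area
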
  • Back in YOLOLayer.forward: at inference time the decoded output is returned directly; at training time, build_targets supplies the regression targets and the loss is assembled:
        if targets is None:
            return output, 0
        else:  # compute the loss
            iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
                pred_boxes=pred_boxes,  # predicted boxes
                pred_cls=pred_cls,      # predicted classes
                target=targets,         # ground-truth labels
                anchors=self.scaled_anchors,
                ignore_thres=self.ignore_thres,
            )
            # iou_scores: IoU between each ground truth and its best-matching anchor
            # class_mask: where the class was predicted correctly
            # obj_mask:   1 at the best anchor for each ground-truth box
            # noobj_mask: 0 wherever obj_mask is 1 or the IoU exceeds the threshold, 1 elsewhere
            # tx, ty, tw, th: the xywh targets on this feature map, i.e. the values we fit
            # tconf: the target confidence
            # Loss : Mask outputs to ignore non-existing objects (except with conf. loss)
            loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])  # computed only where there is an object
            loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
            loss_w = self.mse_loss(w[obj_mask], tw[obj_mask])
            loss_h = self.mse_loss(h[obj_mask], th[obj_mask])
            loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])  # foreground confidence loss
            loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])  # background confidence loss
            loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj  # combined confidence loss: with an object, the closer to 1 the better; without, the closer to 0 the better
            loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])  # classification loss
            total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls  # total loss
            # Metrics at various thresholds
            cls_acc = 100 * class_mask[obj_mask].mean()
            conf_obj = pred_conf[obj_mask].mean()
            conf_noobj = pred_conf[noobj_mask].mean()
            conf50 = (pred_conf > 0.5).float()
            iou50 = (iou_scores > 0.5).float()
            iou75 = (iou_scores > 0.75).float()
            detected_mask = conf50 * class_mask * tconf
            precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
            recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
            recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)
            return output, total_loss
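
  • The loss functions and scale factors used above are set up in YOLOLayer.__init__; a sketch of the relevant pieces (the values follow the PyTorch-YOLOv3 repo defaults -- treat them as assumptions if your fork differs):

    import torch.nn as nn

    class YOLOLayer(nn.Module):
        def __init__(self, anchors, num_classes, img_dim=416):
            super(YOLOLayer, self).__init__()
            self.anchors = anchors
            self.num_anchors = len(anchors)
            self.num_classes = num_classes
            self.ignore_thres = 0.5       # anchors above this IoU are not punished as background
            self.mse_loss = nn.MSELoss()  # for the x, y, w, h regression terms
            self.bce_loss = nn.BCELoss()  # for the objectness and class terms
            self.obj_scale = 1            # weight on the foreground confidence loss
            self.noobj_scale = 100        # weight on the background confidence loss
            self.img_dim = img_dim
            self.grid_size = 0            # recomputed on the first forward pass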

5. Backward Pass

  • Everything above amounts to a single call, loss, outputs = model(imgs, targets); the training loop drives it:
    for epoch in range(opt.epochs):  # data is read in batch by batch during training
        model.train()
        start_time = time.time()
        for batch_i, (_, imgs, targets) in enumerate(dataloader):
            batches_done = len(dataloader) * epoch + batch_i

            imgs = Variable(imgs.to(device))
            targets = Variable(targets.to(device), requires_grad=False)
            print ('imgs',imgs.shape)
            print ('targets',targets.shape)
            loss, outputs = model(imgs, targets)  # forward pass with inputs and labels yields the loss and outputs
            loss.backward()  # backward pass

            if batches_done % opt.gradient_accumulations == 0:
                # Accumulate gradients over several batches before each step
                optimizer.step()       # update the weights
                optimizer.zero_grad()  # reset the gradients
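
  • The loop above assumes the model, data loader, and optimizer were already built; in the PyTorch-YOLOv3 repo this is plain Adam over all parameters (a minimal sketch, with names following that repo):

    model = Darknet(opt.model_def).to(device)
    dataset = ListDataset(train_path, augment=True, multiscale=opt.multiscale_training)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=opt.batch_size,
        shuffle=True,
        num_workers=opt.n_cpu,
        collate_fn=dataset.collate_fn,  # stacks targets and handles multi-scale resizing
    )
    optimizer = torch.optim.Adam(model.parameters())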

6. Results

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    os.makedirs("output", exist_ok=True)  # create the output folder

    # Set up model
    model = Darknet(opt.model_def, img_size=opt.img_size).to(device)  # the Darknet network
    # Load the trained weights
    if opt.weights_path.endswith(".weights"):
        # Load darknet weights
        model.load_darknet_weights(opt.weights_path)
    else:
        # Load checkpoint weights
        model.load_state_dict(torch.load(opt.weights_path))
    model.eval()  # evaluation mode: forward passes only, no parameter updates
    dataloader = DataLoader(  # where the data is read from
        ImageFolder(opt.image_folder, img_size=opt.img_size),
        batch_size=opt.batch_size,
        shuffle=False,
        num_workers=opt.n_cpu,
    )
    classes = load_classes(opt.class_path)  # map class ids to names, extracted from file
    Tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
    imgs = []  # Stores image paths
    img_detections = []  # Stores detections for each image index
    print("\nPerforming object detection:")
    prev_time = time.time()
    for batch_i, (img_paths, input_imgs) in enumerate(dataloader):  # keep pulling batches
        # Configure input
        input_imgs = Variable(input_imgs.type(Tensor))  # convert to the right tensor type

        # Run the forward pass to get detections
        with torch.no_grad():
            detections = model(input_imgs)  # feed the inputs through the model to get predictions
            detections = non_max_suppression(detections, opt.conf_thres, opt.nms_thres)  # non-maximum suppression

        # Log progress
        current_time = time.time()
        inference_time = datetime.timedelta(seconds=current_time - prev_time)
        prev_time = current_time
        print("\t+ Batch %d, Inference Time: %s" % (batch_i, inference_time))

        # Save image and detections
        imgs.extend(img_paths)
        img_detections.extend(detections)

    # Bounding-box colors
    cmap = plt.get_cmap("tab20b")
    colors = [cmap(i) for i in np.linspace(0, 1, 20)]

    print("\nSaving images:")
    # Iterate through images and save plot of detections
    for img_i, (path, detections) in enumerate(zip(imgs, img_detections)):
        print("(%d) Image: '%s'" % (img_i, path))

        # Create plot
        img = np.array(Image.open(path))
        plt.figure()
        fig, ax = plt.subplots(1)
        ax.imshow(img)

        # Draw bounding boxes and labels of detections
        if detections is not None:
            # Rescale boxes to original image
            detections = rescale_boxes(detections, opt.img_size, img.shape[:2])
            unique_labels = detections[:, -1].cpu().unique()
            n_cls_preds = len(unique_labels)
            bbox_colors = random.sample(colors, n_cls_preds)
            for x1, y1, x2, y2, conf, cls_conf, cls_pred in detections:

                print("\t+ Label: %s, Conf: %.5f" % (classes[int(cls_pred)], cls_conf.item()))

                box_w = x2 - x1
                box_h = y2 - y1

                color = bbox_colors[int(np.where(unique_labels == int(cls_pred))[0])]
                # Create a Rectangle patch
                bbox = patches.Rectangle((x1, y1), box_w, box_h, linewidth=2, edgecolor=color, facecolor="none")
                # Add the bbox to the plot
                ax.add_patch(bbox)
                # Add label
                plt.text(
                    x1,
                    y1,
                    s=classes[int(cls_pred)],
                    color="white",
                    verticalalignment="top",
                    bbox={"color": color, "pad": 0},
                )

        # Save generated image with detections
        plt.axis("off")
        plt.gca().xaxis.set_major_locator(NullLocator())
        plt.gca().yaxis.set_major_locator(NullLocator())
        filename = path.split("\\")[-1].split(".")[0]
        plt.savefig(f"output/{filename}.png", bbox_inches="tight", pad_inches=0.0)
        plt.close()
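
  • non_max_suppression here is the repo's utility; a functionally similar sketch built on torchvision.ops.nms (not the repo's exact implementation) that yields the same (x1, y1, x2, y2, conf, cls_conf, cls_pred) rows unpacked above:

    import torch
    from torchvision.ops import nms

    def simple_nms(prediction, conf_thres=0.5, nms_thres=0.4):
        # prediction: [batch, num_boxes, 5 + num_classes] with boxes as (cx, cy, w, h)
        outputs = []
        for pred in prediction:
            pred = pred[pred[:, 4] >= conf_thres]  # drop low-objectness boxes
            if not pred.size(0):
                outputs.append(None)
                continue
            boxes = pred[:, :4].clone()  # (cx, cy, w, h) -> (x1, y1, x2, y2)
            boxes[:, 0] = pred[:, 0] - pred[:, 2] / 2
            boxes[:, 1] = pred[:, 1] - pred[:, 3] / 2
            boxes[:, 2] = pred[:, 0] + pred[:, 2] / 2
            boxes[:, 3] = pred[:, 1] + pred[:, 3] / 2
            cls_conf, cls_pred = pred[:, 5:].max(1)
            # offset boxes by class id so overlapping boxes of different classes both survive
            keep = nms(boxes + cls_pred[:, None].float() * 4096,
                       pred[:, 4] * cls_conf, nms_thres)
            det = torch.cat((boxes, pred[:, 4:5], cls_conf[:, None],
                             cls_pred[:, None].float()), 1)
            outputs.append(det[keep])
        return outputs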

Sample results: