YOLOv3 drops pooling and fully connected layers; its backbone, named Darknet-53, is built from 53 convolutional layers. It predicts at multiple scales, with three candidate (anchor) boxes per scale, and fuses feature maps of different resolutions before prediction: the map with the larger receptive field is upsampled and merged with a map whose receptive field is smaller. The model is trained on the COCO dataset and returns the locations of objects as well as their classes, i.e. it performs both regression and classification.
1. Data Preprocessing
- Data is read in batch by batch during training and processed as it is fed to the model; it is not all preprocessed up front.
- First, load the actual image data and convert it to a tensor; then pad the image into a square based on its input size. Meanwhile, read the label file that corresponds to the image (the label data are the box coordinates), convert the labels to a tensor, and remap the coordinates onto the padded image. Finally, convert x1y1x2y2 (top-left and bottom-right corners) into xywh (center point plus width and height), and apply data augmentation.
    def __getitem__(self, index):  # load one image and its label
# ---------
# Image
# ---------
img_path = self.img_files[index % len(self.img_files)].rstrip()
        img_path = 'C:\\Users\\lus\\AA-project\\pytorch\\PyTorch-YOLOv3\\data\\coco' + img_path  # path to the actual image data
#print (img_path)
# Extract image as PyTorch tensor
        img = transforms.ToTensor()(Image.open(img_path).convert('RGB'))  # convert the image data to a tensor
        # Handle images with fewer than three channels
        if len(img.shape) != 3:
            img = img.unsqueeze(0)
            img = img.expand((3, *img.shape[1:]))  # broadcast the single channel to all 3 channels
        _, h, w = img.shape  # input image size, e.g. _: 3, h: 375, w: 500
h_factor, w_factor = (h, w) if self.normalized_labels else (1, 1)
        # Pad the rectangular input to square resolution (see the pad_to_square sketch after this code block)
        img, pad = pad_to_square(img, 0)
_, padded_h, padded_w = img.shape # padded_h:500 padded_w:500
# ---------
# Label
# ---------
        # Process the label; each label file corresponds to one image
label_path = self.label_files[index % len(self.img_files)].rstrip()
label_path = 'C:\\Users\\lus\\AA-project\\pytorch\\PyTorch-YOLOv3\\data\\coco\\labels' + label_path
#print (label_path)
targets = None
        if os.path.exists(label_path):  # the label data are the box coordinates
            boxes = torch.from_numpy(np.loadtxt(label_path).reshape(-1, 5))  # convert the labels to a tensor; column 0 is the object's class index, the rest are x, y, w, h
            # Extract corner coordinates on the unpadded + unscaled image, then shift them to the padded image below
            x1 = w_factor * (boxes[:, 1] - boxes[:, 3] / 2)  # coordinates before padding
            y1 = h_factor * (boxes[:, 2] - boxes[:, 4] / 2)
            x2 = w_factor * (boxes[:, 1] + boxes[:, 3] / 2)
            y2 = h_factor * (boxes[:, 2] + boxes[:, 4] / 2)
# Adjust for added padding
            x1 += pad[0]  # coordinates after padding
y1 += pad[2]
x2 += pad[1]
y2 += pad[3]
            # Convert x1y1x2y2 back to (x, y, w, h), normalized by the padded size
            boxes[:, 1] = ((x1 + x2) / 2) / padded_w  # center x
            boxes[:, 2] = ((y1 + y2) / 2) / padded_h  # center y
            boxes[:, 3] *= w_factor / padded_w  # w
            boxes[:, 4] *= h_factor / padded_h  # h
            targets = torch.zeros((len(boxes), 6))  # 6 columns: (image index within the batch, class, x, y, w, h)
targets[:, 1:] = boxes
        # Apply augmentations
if self.augment:
if np.random.random() < 0.5:
img, targets = horisontal_flip(img, targets)
        return img_path, img, targets  # return the image data and its labels
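The pad_to_square helper used above comes from the repo's utils: it pads the shorter side on both ends with a constant value and returns both the padded image and the padding amounts, which is why pad[0] and pad[2] are added to x1 and y1 above. A minimal sketch of that behavior:
import torch.nn.functional as F

def pad_to_square(img, pad_value):
    # img is a CHW tensor; pad the shorter side so that h == w
    c, h, w = img.shape
    dim_diff = abs(h - w)
    # (upper / left) padding and (lower / right) padding
    pad1, pad2 = dim_diff // 2, dim_diff - dim_diff // 2
    # F.pad takes (left, right, top, bottom) for the last two dimensions
    pad = (0, 0, pad1, pad2) if h <= w else (pad1, pad2, 0, 0)
    img = F.pad(img, pad, "constant", value=pad_value)
    return img, pad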
2. Building the Network
- Create the model by defining each layer's parameters, layer by layer, in the order given by the config file (an excerpt of the config format is shown after this function):
def create_modules(module_defs):
"""
Constructs module list of layer blocks from module configuration in module_defs
"""
    hyperparams = module_defs.pop(0)  # the first block of the config file holds the hyperparameters
output_filters = [int(hyperparams["channels"])]
    module_list = nn.ModuleList()  # modules are appended to it in order
    for module_i, module_def in enumerate(module_defs):  # iterate over each block of the config file (one block = conv + BN + LeakyReLU)
modules = nn.Sequential()
if module_def["type"] == "convolutional":#判断每一模块的type
bn = int(module_def["batch_normalize"])#加BN
filters = int(module_def["filters"])
kernel_size = int(module_def["size"])
pad = (kernel_size - 1) // 2
modules.add_module(
f"conv_{module_i}",
nn.Conv2d(
                    in_channels=output_filters[-1],  # input channels = number of feature maps produced by the previous layer
out_channels=filters,
kernel_size=kernel_size,
stride=int(module_def["stride"]),
padding=pad,
                    bias=not bn,  # with BN the conv bias is redundant, so it is dropped
),
)
            if bn:  # append a BN layer when requested
modules.add_module(f"batch_norm_{module_i}", nn.BatchNorm2d(filters, momentum=0.9, eps=1e-5))
if module_def["activation"] == "leaky":#判断每一模块的type
modules.add_module(f"leaky_{module_i}", nn.LeakyReLU(0.1))
elif module_def["type"] == "maxpool":
kernel_size = int(module_def["size"])
stride = int(module_def["stride"])
if kernel_size == 2 and stride == 1:
modules.add_module(f"_debug_padding_{module_i}", nn.ZeroPad2d((0, 1, 0, 1)))
maxpool = nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=int((kernel_size - 1) // 2))
modules.add_module(f"maxpool_{module_i}", maxpool)
elif module_def["type"] == "upsample":
            upsample = Upsample(scale_factor=int(module_def["stride"]), mode="nearest")  # define the upsampling layer
modules.add_module(f"upsample_{module_i}", upsample)
        # The route layer concatenates feature maps along the channel dimension: the map with the larger receptive field is upsampled and concatenated with one whose receptive field is smaller (the deeper features guide the shallower ones)
        elif module_def["type"] == "route":  # e.g. input 1: 26*26*256, input 2: 26*26*128, output: 26*26*(256+128)
layers = [int(x) for x in module_def["layers"].split(",")]
filters = sum([output_filters[1:][i] for i in layers])
modules.add_module(f"route_{module_i}", EmptyLayer())#创建空的层,先占位置,前向传播再用
        # The shortcut layer adds values element-wise; the config specifies a residual connection to an earlier layer (e.g. three layers back), so if the layers in between learn poorly, that earlier output is added directly to the current one
elif module_def["type"] == "shortcut":
filters = output_filters[1:][int(module_def["from"])]
modules.add_module(f"shortcut_{module_i}", EmptyLayer())#创建空的层,先占位置,前向传播再用
elif module_def["type"] == "yolo":
            anchor_idxs = [int(x) for x in module_def["mask"].split(",")]  # ids of the anchors used at this scale
            # Extract the actual anchor sizes
anchors = [int(x) for x in module_def["anchors"].split(",")]
anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]
anchors = [anchors[i] for i in anchor_idxs]
            num_classes = int(module_def["classes"])  # 80 classes
img_size = int(hyperparams["height"])
# Define detection layer
yolo_layer = YOLOLayer(anchors, num_classes, img_size)
modules.add_module(f"yolo_{module_i}", yolo_layer)
# Register module list and number of output filters
        module_list.append(modules)  # append the whole block (conv + BN + activation) to module_list
        output_filters.append(filters)  # record the number of output feature maps
return hyperparams, module_list
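For reference, the blocks parsed by this function look roughly like the following excerpt of yolov3.cfg (the anchor values shown are the standard YOLOv3 ones):
[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[route]
layers=-1, 61

[yolo]
mask=6,7,8
anchors=10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=80
num=9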
- How the data actually flows once it arrives:
class Darknet(nn.Module):
"""YOLOv3 object detection model"""
    # Constructor: declare in advance which modules the network uses
def __init__(self, config_path, img_size=416):
super(Darknet, self).__init__()
        self.module_defs = parse_model_config(config_path)  # read the prepared config file
        self.hyperparams, self.module_list = create_modules(self.module_defs)  # create the model, defining each layer's parameters in config order
self.yolo_layers = [layer[0] for layer in self.module_list if hasattr(layer[0], "metrics")]
self.img_size = img_size
self.seen = 0
self.header_info = np.array([0, 0, 0, self.seen, 0], dtype=np.int32)
    # How the data actually flows through the network
def forward(self, x, targets=None):
img_dim = x.shape[2]
loss = 0
        layer_outputs, yolo_outputs = [], []  # outputs of every layer so far, and outputs of the yolo layers
for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
if module_def["type"] in ["convolutional", "upsample", "maxpool"]:
                x = module(x)  # standard PyTorch modules; call them directly
elif module_def["type"] == "route":
                x = torch.cat([layer_outputs[int(layer_i)] for layer_i in module_def["layers"].split(",")], 1)  # concatenate along the channel dimension
elif module_def["type"] == "shortcut":
                layer_i = int(module_def["from"])  # e.g. layer_i = -3 means add the output from three layers back
                x = layer_outputs[-1] + layer_outputs[layer_i]  # element-wise addition
elif module_def["type"] == "yolo":
                x, layer_loss = module[0](x, targets, img_dim)  # x is the previous layer's output; targets holds the label xywh; img_dim is the input image size
loss += layer_loss
yolo_outputs.append(x)
            layer_outputs.append(x)  # record the current layer's output
yolo_outputs = to_cpu(torch.cat(yolo_outputs, 1))
return yolo_outputs if targets is None else (loss, yolo_outputs)
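The EmptyLayer and Upsample classes referenced in create_modules are small helpers; in this repo they amount essentially to the following (Upsample wraps F.interpolate because nn.Upsample was deprecated at the time):
import torch.nn as nn
import torch.nn.functional as F

class EmptyLayer(nn.Module):
    """Placeholder for 'route' and 'shortcut' layers; Darknet.forward() does the actual work."""
    def __init__(self):
        super(EmptyLayer, self).__init__()

class Upsample(nn.Module):
    """Nearest-neighbor upsampling via F.interpolate."""
    def __init__(self, scale_factor, mode="nearest"):
        super(Upsample, self).__init__()
        self.scale_factor = scale_factor
        self.mode = mode

    def forward(self, x):
        return F.interpolate(x, scale_factor=self.scale_factor, mode=self.mode)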
3. Forward Pass
(1) The YOLO layer
- Convert relative positions into absolute positions on the feature map:
def compute_grid_offsets(self, grid_size, cuda=True):
self.grid_size = grid_size
g = self.grid_size
FloatTensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor
self.stride = self.img_dim / self.grid_size
# Calculate offsets for each grid
self.grid_x = torch.arange(g).repeat(g, 1).view([1, 1, g, g]).type(FloatTensor)
self.grid_y = torch.arange(g).repeat(g, 1).t().view([1, 1, g, g]).type(FloatTensor)
self.scaled_anchors = FloatTensor([(a_w / self.stride, a_h / self.stride) for a_w, a_h in self.anchors])
self.anchor_w = self.scaled_anchors[:, 0:1].view((1, self.num_anchors, 1, 1))
self.anchor_h = self.scaled_anchors[:, 1:2].view((1, self.num_anchors, 1, 1))
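A quick standalone check of what these offsets look like, with illustrative numbers (a 13x13 grid on a 416x416 input gives stride 32; the anchors are the largest-scale ones from yolov3.cfg):
import torch

g, img_dim = 13, 416
stride = img_dim / g  # 32 pixels per grid cell
anchors = [(116, 90), (156, 198), (373, 326)]

grid_x = torch.arange(g).repeat(g, 1).view(1, 1, g, g).float()
grid_y = torch.arange(g).repeat(g, 1).t().view(1, 1, g, g).float()
scaled_anchors = torch.tensor([(a_w / stride, a_h / stride) for a_w, a_h in anchors])

print(grid_x[0, 0, 0, :5])  # tensor([0., 1., 2., 3., 4.]) -> x index of each cell
print(scaled_anchors[0])    # tensor([3.6250, 2.8125]) -> anchor size in grid units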
- The forward pass of the YOLO layer:
def forward(self, x, targets=None, img_dim=None):
# Tensors for cuda support
        print(x.shape)  # [4, 255, 10, 10]: batch size, number of feature maps, feature map height and width
FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if x.is_cuda else torch.ByteTensor
        self.img_dim = img_dim  # the input size is not fixed
        num_samples = x.size(0)  # batch size, here 4 images per step
        grid_size = x.size(2)  # grid size
prediction = (
            x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)  # self.num_anchors: 3 candidate boxes per grid cell; self.num_classes: 80 classes
.permute(0, 1, 3, 4, 2)
.contiguous()
)
print (prediction.shape)#[4, 3, 10, 10, 85]
# Get outputs
        x = torch.sigmoid(prediction[..., 0])  # Center x (relative offset within a cell)
y = torch.sigmoid(prediction[..., 1]) # Center y
w = prediction[..., 2] # Width
h = prediction[..., 3] # Height
pred_conf = torch.sigmoid(prediction[..., 4]) # Conf
        pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred: per-class probability in [0, 1]
# If grid size does not match current we compute new offsets
if grid_size != self.grid_size:
            self.compute_grid_offsets(grid_size, cuda=x.is_cuda)  # turn relative positions into absolute ones, e.g. (0.5, 0.5) becomes (11.5, 11.5)
        # Add the offset and scale with the anchors: actual positions on the feature map
pred_boxes = FloatTensor(prediction[..., :4].shape)
pred_boxes[..., 0] = x.data + self.grid_x
pred_boxes[..., 1] = y.data + self.grid_y
pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w
pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h
output = torch.cat(
(
                pred_boxes.view(num_samples, -1, 4) * self.stride,  # map back to input-image coordinates
pred_conf.view(num_samples, -1, 1),
pred_cls.view(num_samples, -1, self.num_classes),
),
-1,
)
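Decoding one prediction by hand makes the formulas above concrete: b_x = sigmoid(t_x) + c_x, b_y = sigmoid(t_y) + c_y, b_w = p_w * exp(t_w), b_h = p_h * exp(t_h), all in grid units and then multiplied by the stride. The numbers below are made up purely for illustration:
import math

sigmoid = lambda t: 1 / (1 + math.exp(-t))

# cell (c_x, c_y) = (5, 7); raw outputs t_x=0.2, t_y=-0.3, t_w=0.5, t_h=0.1;
# anchor (p_w, p_h) = (3.625, 2.8125) in grid units; stride = 32
bx = sigmoid(0.2) + 5        # ~5.55, box center x in grid units
by = sigmoid(-0.3) + 7       # ~7.43
bw = 3.625 * math.exp(0.5)   # ~5.98
bh = 2.8125 * math.exp(0.1)  # ~3.11
print([round(v * 32, 1) for v in (bx, by, bw, bh)])  # box in input-image pixels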
4. Computing the Loss
- The loss consists of four parts: localization error, confidence error for anchors that contain an object, confidence error for anchors that do not, and classification error.
def build_targets(pred_boxes, pred_cls, target, anchors, ignore_thres):
ByteTensor = torch.cuda.ByteTensor if pred_boxes.is_cuda else torch.ByteTensor
FloatTensor = torch.cuda.FloatTensor if pred_boxes.is_cuda else torch.FloatTensor
    nB = pred_boxes.size(0)  # batch size, here 4
    nA = pred_boxes.size(1)  # number of anchors per grid cell
    nC = pred_cls.size(-1)  # number of classes, 80
    nG = pred_boxes.size(2)  # grid size
    # Output tensors; the loss covers localization error, object / no-object confidence error, and classification error
    obj_mask = ByteTensor(nB, nA, nG, nG).fill_(0)  # 1 where an anchor contains an object, default 0 (foreground)
    noobj_mask = ByteTensor(nB, nA, nG, nG).fill_(1)  # 1 where an anchor contains no object, default 1 (background)
    class_mask = FloatTensor(nB, nA, nG, nG).fill_(0)  # class mask: 1 where the class is predicted correctly, default 0
    iou_scores = FloatTensor(nB, nA, nG, nG).fill_(0)  # IoU between predicted and ground-truth boxes
    tx = FloatTensor(nB, nA, nG, nG).fill_(0)  # ground-truth position relative to the grid cell
    ty = FloatTensor(nB, nA, nG, nG).fill_(0)
    tw = FloatTensor(nB, nA, nG, nG).fill_(0)
    th = FloatTensor(nB, nA, nG, nG).fill_(0)
    tcls = FloatTensor(nB, nA, nG, nG, nC).fill_(0)  # class targets
    # Convert to actual positions on the feature map
    target_boxes = target[:, 2:6] * nG  # target xywh are normalized to [0, 1] on the original image; multiplying by nG gives xywh on the current grid, where the loss is computed
gxy = target_boxes[:, :2]
gwh = target_boxes[:, 2:]
# Get anchors with best iou
    ious = torch.stack([bbox_wh_iou(anchor, gwh) for anchor in anchors])  # IoU of each anchor shape with every ground-truth box
    print(ious.shape)  # [3, 18]: 18 ground-truth boxes, each compared with the 3 anchor shapes
    best_ious, best_n = ious.max(0)  # best score and which anchor shape fits each target best; best_n is 0, 1 or 2
# Separate target values
    b, target_labels = target[:, :2].long().t()  # b: which image in the batch each box belongs to; target_labels: each box's actual class
gx, gy = gxy.t()
gw, gh = gwh.t()
    gi, gj = gxy.long().t()  # grid cell indices, floored
# Set masks
    obj_mask[b, best_n, gj, gi] = 1  # mark cells that actually contain an object
    noobj_mask[b, best_n, gj, gi] = 0  # and clear the background mask there
# Set noobj mask to zero where iou exceeds ignore threshold
    for i, anchor_ious in enumerate(ious.t()):  # an IoU above the ignore threshold counts as containing an object, so it is removed from the background mask
noobj_mask[b[i], anchor_ious > ignore_thres, gj[i], gi[i]] = 0
    # Coordinates: tx, ty are the positions within the grid cell
    tx[b, best_n, gj, gi] = gx - gx.floor()  # offset of the ground-truth center relative to its grid cell
ty[b, best_n, gj, gi] = gy - gy.floor()
# Width and height
tw[b, best_n, gj, gi] = torch.log(gw / anchors[best_n][:, 0] + 1e-16)
th[b, best_n, gj, gi] = torch.log(gh / anchors[best_n][:, 1] + 1e-16)
# One-hot encoding of label
    tcls[b, best_n, gj, gi, target_labels] = 1  # one-hot encode the ground-truth labels
    # Compute label correctness and IoU at the best anchor, i.e. where predictions match the ground truth
    class_mask[b, best_n, gj, gi] = (pred_cls[b, best_n, gj, gi].argmax(-1) == target_labels).float()
    iou_scores[b, best_n, gj, gi] = bbox_iou(pred_boxes[b, best_n, gj, gi], target_boxes, x1y1x2y2=False)  # IoU between each ground truth and its matched prediction
    tconf = obj_mask.float()  # ground-truth confidence, i.e. 1
return iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf
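The bbox_wh_iou helper used above compares only widths and heights, as if both boxes were centered at the same point; this is how the best anchor shape is chosen before positions are considered. In the repo it is essentially:
import torch

def bbox_wh_iou(wh1, wh2):
    # wh1: one anchor as a (w, h) tensor; wh2: (N, 2) ground-truth sizes
    wh2 = wh2.t()
    w1, h1 = wh1[0], wh1[1]
    w2, h2 = wh2[0], wh2[1]
    inter_area = torch.min(w1, w2) * torch.min(h1, h2)
    union_area = (w1 * h1 + 1e-16) + w2 * h2 - inter_area
    return inter_area / union_area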
if targets is None:
return output, 0
        else:  # compute the loss
iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
                pred_boxes=pred_boxes,  # predicted boxes
                pred_cls=pred_cls,  # predicted classes
                target=targets,  # labels
anchors=self.scaled_anchors,
ignore_thres=self.ignore_thres,
)
            # iou_scores: IoU between each ground truth and its best-matching anchor; class_mask: indices where the class is predicted correctly;
            # obj_mask: 1 at the best anchor for each ground-truth box; noobj_mask: 0 wherever obj_mask is 1 or the IoU exceeds the threshold, 1 elsewhere;
            # tx, ty, tw, th: the xywh targets on this feature map, i.e. the values to regress; tconf: target confidence
# Loss : Mask outputs to ignore non-existing objects (except with conf. loss)
            loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])  # computed only where there is an object
loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
loss_w = self.mse_loss(w[obj_mask], tw[obj_mask])
loss_h = self.mse_loss(h[obj_mask], th[obj_mask])
            loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])  # foreground confidence loss
            loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])  # background confidence loss
            loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj  # combined confidence loss: with an object, closer to 1 is better; without, closer to 0
            loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])  # classification loss
            total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls  # total loss
            # Metrics for monitoring the losses
cls_acc = 100 * class_mask[obj_mask].mean()
conf_obj = pred_conf[obj_mask].mean()
conf_noobj = pred_conf[noobj_mask].mean()
conf50 = (pred_conf > 0.5).float()
iou50 = (iou_scores > 0.5).float()
iou75 = (iou_scores > 0.75).float()
detected_mask = conf50 * class_mask * tconf
precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)
return output, total_loss
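The mse_loss, bce_loss and scale factors used above are attributes that YOLOLayer sets up in its constructor; a sketch of that setup with the default values this repo uses (the no-object term is weighted heavily because background cells vastly outnumber object cells):
import torch.nn as nn

class YOLOLayerLossSetup(nn.Module):
    """Sketch of the loss-related attributes defined in YOLOLayer.__init__."""
    def __init__(self):
        super().__init__()
        self.mse_loss = nn.MSELoss()  # for the x, y, w, h regression terms
        self.bce_loss = nn.BCELoss()  # for confidence and classification
        self.obj_scale = 1            # weight of the foreground confidence loss
        self.noobj_scale = 100        # weight of the background confidence loss
        self.ignore_thres = 0.5       # IoU threshold used when building noobj_mask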
5. Backpropagation
- At this point, loss, outputs = model(imgs, targets) from above has effectively already been executed:
for epoch in range(opt.epochs):  # data is read in batch by batch during training
model.train()
start_time = time.time()
for batch_i, (_, imgs, targets) in enumerate(dataloader):
batches_done = len(dataloader) * epoch + batch_i
imgs = Variable(imgs.to(device))
targets = Variable(targets.to(device), requires_grad=False)
print ('imgs',imgs.shape)
print ('targets',targets.shape)
        loss, outputs = model(imgs, targets)  # forward pass with inputs and labels, returning the loss and outputs
        loss.backward()  # backpropagation
        if batches_done % opt.gradient_accumulations == 0:
            # Accumulate gradients over several batches before each step
            optimizer.step()  # update the weights
            optimizer.zero_grad()  # reset the gradients
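For completeness, the model, optimizer and dataloader used by this loop are created beforehand; in the repo's train.py the setup is essentially this sketch:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Darknet(opt.model_def).to(device)  # build the network from the config file
model.apply(weights_init_normal)           # random init (pretrained weights can be loaded instead)
optimizer = torch.optim.Adam(model.parameters())  # Adam with default settings

dataset = ListDataset(train_path, augment=True, multiscale=opt.multiscale_training)
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=opt.batch_size,
    shuffle=True,
    num_workers=opt.n_cpu,
    collate_fn=dataset.collate_fn,  # fills column 0 of targets with the sample index and stacks images
)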
6. Results
if __name__ == "__main__":
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
os.makedirs("output", exist_ok=True)#创建一个生成的文件夹
# Set up model
    model = Darknet(opt.model_def, img_size=opt.img_size).to(device)  # the Darknet network
    # Load the trained weights
if opt.weights_path.endswith(".weights"):
# Load darknet weights
model.load_darknet_weights(opt.weights_path)
else:
# Load checkpoint weights
model.load_state_dict(torch.load(opt.weights_path))
    model.eval()  # evaluation mode: forward pass only, no parameter updates
    dataloader = DataLoader(  # where to read the data from
ImageFolder(opt.image_folder, img_size=opt.img_size),
batch_size=opt.batch_size,
shuffle=False,
num_workers=opt.n_cpu,
)
    classes = load_classes(opt.class_path)  # map class ids to their names
Tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
imgs = [] # Stores image paths
img_detections = [] # Stores detections for each image index
print("\nPerforming object detection:")
prev_time = time.time()
    for batch_i, (img_paths, input_imgs) in enumerate(dataloader):  # fetch data batch by batch
# Configure input
        input_imgs = Variable(input_imgs.type(Tensor))  # convert to the right tensor type
        # Run the forward pass to get detections
with torch.no_grad():
            detections = model(input_imgs)  # feed the inputs through the model to get predictions
            detections = non_max_suppression(detections, opt.conf_thres, opt.nms_thres)  # non-maximum suppression (a sketch of the idea follows this listing)
# Log progress
current_time = time.time()
inference_time = datetime.timedelta(seconds=current_time - prev_time)
prev_time = current_time
print("\t+ Batch %d, Inference Time: %s" % (batch_i, inference_time))
# Save image and detections
imgs.extend(img_paths)
img_detections.extend(detections)
# Bounding-box colors
cmap = plt.get_cmap("tab20b")
colors = [cmap(i) for i in np.linspace(0, 1, 20)]
print("\nSaving images:")
# Iterate through images and save plot of detections
for img_i, (path, detections) in enumerate(zip(imgs, img_detections)):
print("(%d) Image: '%s'" % (img_i, path))
# Create plot
img = np.array(Image.open(path))
plt.figure()
fig, ax = plt.subplots(1)
ax.imshow(img)
# Draw bounding boxes and labels of detections
if detections is not None:
# Rescale boxes to original image
detections = rescale_boxes(detections, opt.img_size, img.shape[:2])
unique_labels = detections[:, -1].cpu().unique()
n_cls_preds = len(unique_labels)
bbox_colors = random.sample(colors, n_cls_preds)
for x1, y1, x2, y2, conf, cls_conf, cls_pred in detections:
print("\t+ Label: %s, Conf: %.5f" % (classes[int(cls_pred)], cls_conf.item()))
box_w = x2 - x1
box_h = y2 - y1
color = bbox_colors[int(np.where(unique_labels == int(cls_pred))[0])]
# Create a Rectangle patch
bbox = patches.Rectangle((x1, y1), box_w, box_h, linewidth=2, edgecolor=color, facecolor="none")
# Add the bbox to the plot
ax.add_patch(bbox)
# Add label
plt.text(
x1,
y1,
s=classes[int(cls_pred)],
color="white",
verticalalignment="top",
bbox={"color": color, "pad": 0},
)
# Save generated image with detections
plt.axis("off")
plt.gca().xaxis.set_major_locator(NullLocator())
plt.gca().yaxis.set_major_locator(NullLocator())
        filename = path.split("\\")[-1].split(".")[0]
        plt.savefig(f"output/{filename}.png", bbox_inches="tight", pad_inches=0.0)
plt.close()
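The non_max_suppression call in the detection loop filters overlapping boxes. Its core idea can be sketched as a simplified greedy NMS (not the repo's exact implementation, which additionally filters by confidence and handles classes):
import torch

def greedy_nms(boxes, scores, iou_thres=0.4):
    """boxes: (N, 4) in x1y1x2y2; scores: (N,). Returns indices of kept boxes."""
    order = scores.argsort(descending=True)
    keep = []
    while order.numel() > 0:
        i = order[0]
        keep.append(int(i))
        if order.numel() == 1:
            break
        rest = boxes[order[1:]]
        # IoU of the current best box with all remaining boxes
        x1 = torch.max(boxes[i, 0], rest[:, 0])
        y1 = torch.max(boxes[i, 1], rest[:, 1])
        x2 = torch.min(boxes[i, 2], rest[:, 2])
        y2 = torch.min(boxes[i, 3], rest[:, 3])
        inter = (x2 - x1).clamp(min=0) * (y2 - y1).clamp(min=0)
        area_i = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
        area_r = (rest[:, 2] - rest[:, 0]) * (rest[:, 3] - rest[:, 1])
        iou = inter / (area_i + area_r - inter + 1e-16)
        order = order[1:][iou <= iou_thres]  # drop boxes that overlap too much
    return keep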
Sample results: