代码说明
make_txt.py
用来创建含有图片路径与标签文档的程序,但是如果样本中有重名文件( a.jpg和a(2).jpg )需要将第二个删掉,否则深度神经网络由于无法找到图片而报错。
可以将已生成的txt删除然后重新使用该程序生成,但是要删除掉两个txt文档中含有(2)的行,训练文档大概有16个,测试文档1个
现有的通过该程序生成的两个文档,已经删去了重名的部分,可以正常运行
import os
def make_txt(root):
    """Scan every label file under ``root/labels`` and append one line per
    annotation to ``root/data.txt``.

    Each output line has the form::

        <image name> <label> <cx> <cy> <w> <h>

    where the image name is the label file name with ``.txt`` swapped for
    ``.jpg`` and the five fields are copied from the label line verbatim.
    """
    label_path = os.path.join(root, 'labels')
    label_data = os.listdir(label_path)
    # Context managers guarantee both files are closed even on error
    # (the original never closed any of the label-file handles).
    with open(os.path.join(root, 'data.txt'), 'a') as f:
        for line1 in label_data:
            # The image name depends only on the label file, so hoist it
            # out of the per-line loop.
            img_name = line1.replace('.txt', '.jpg')
            with open(os.path.join(label_path, line1), 'r') as fh:
                for line2 in fh:
                    words = line2.split()
                    # image name + first five annotation fields
                    f.write(img_name + ' ' + ' '.join(words[:5]) + '\n')
# Generate data.txt for each split, but only when it does not exist yet
# (re-running the script must not duplicate lines in an existing file).
path1 = r'.\train'
path2 = r'.\valid'
for _root in (path1, path2):
    if not os.path.isfile(os.path.join(_root, 'data.txt')):
        make_txt(_root)
CNN.py
尝试通过普通卷积神经网络进行,构建成功并且每一步都写了注释,但是最后精度只有9%
解决:学习率0.1过高,跳过了最优解,应该设置为0.01左右。另外两个模型使用0.01可能也会有小幅度的提升
import os.path
import torch
import numpy as np
import torch.nn.functional as F # 激活函数
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image # 用来处理图片
import torch.optim as optim
# Pre-processing pipeline applied to every image.
# NOTE(review): this rebinds the imported name `transforms` (the torchvision
# module) to a Compose object; later code relies on the rebound name.
transforms = transforms.Compose([
    # Resize every image to 168x168 (the original comment said 224x224,
    # which does not match the code).
    transforms.Resize([168, 168]),
    # Convert the PIL image to a float tensor.
    transforms.ToTensor()
])
# 自定义dataset
class MyDataset(Dataset):
    """Dataset that reads ``<path>/data.txt`` and yields cropped
    armor-plate images with their integer class labels.

    Each line of ``data.txt`` is::

        <image name> <label> <cx> <cy> <w> <h>

    where cx/cy/w/h look like a normalized (YOLO-style) bounding box —
    fractions of the image size; TODO confirm against make_txt's source.
    """

    def __init__(self, path, train=True, transform=None):
        # Index file listing one annotation per line.
        self.text_root = os.path.join(path, 'data.txt')
        imgs = []      # full image paths under <path>/images
        labels = []    # class labels (strings; converted in __getitem__)
        center_x = []  # normalized box-centre x
        center_y = []  # normalized box-centre y
        length = []    # half of the normalized box width
        width = []     # half of the normalized box height
        name = []      # bare image file names (kept for debugging)
        # `with` guarantees the file is closed (the original leaked it).
        with open(self.text_root, 'r') as f:
            for line in f:
                word = line.rstrip().split(' ')
                center_x.append(float(word[2]))
                center_y.append(float(word[3]))
                length.append(float(word[4]) / 2)
                width.append(float(word[5]) / 2)
                imgs.append(os.path.join(path, 'images', word[0]))
                labels.append(word[1])
                name.append(word[0])
        self.center_x = center_x
        self.center_y = center_y
        self.length = length
        self.width = width
        self.img = imgs
        self.label = labels
        self.transform = transform
        self.name = name

    def __len__(self):
        # One sample per annotation line.
        return len(self.label)

    def __getitem__(self, item):
        label = self.label[item]
        # Drop the alpha channel, if any.
        img = Image.open(self.img[item]).convert('RGB')
        cx = self.center_x[item]
        cy = self.center_y[item]
        half_w = self.length[item]
        half_h = self.width[item]
        # Crop out the armor plate: normalized box -> pixel coordinates.
        img = img.crop((int((cx - half_w) * img.size[0]),
                        int((cy - half_h) * img.size[1]),
                        int((cx + half_w) * img.size[0]),
                        int((cy + half_h) * img.size[1])))
        if self.transform is not None:  # resize, then convert to tensor
            img = self.transform(img)
        # CrossEntropyLoss expects int64 class indices.
        label = torch.from_numpy(np.array(label).astype(np.int64))
        return img, label
# Build the train/validation datasets from their data.txt index files.
path_train = r'.\train'
path_test = r'.\valid'
train_dataset = MyDataset(path_train, train=True, transform=transforms)
# Shuffle training batches each epoch; keep the test order fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
test_dataset = MyDataset(path_test, train=False, transform=transforms)
test_loader = DataLoader(dataset=test_dataset, batch_size=8, shuffle=False)
# 模型
class Net(torch.nn.Module):
    """Plain two-layer CNN classifier for 168x168 RGB crops.

    Layout: conv(3->10, k5) -> maxpool2 -> relu,
            conv(10->20, k5) -> maxpool2 -> relu,
            flatten -> linear(30420 -> 12 armor classes).
    """

    def __init__(self):
        super(Net, self).__init__()
        # Two 5x5 convolutions; the input has 3 channels (RGB).
        self.conv1 = torch.nn.Conv2d(3, 10, kernel_size=5)
        self.conv2 = torch.nn.Conv2d(10, 20, kernel_size=5)
        # One 2x2 max-pool module, reused after each convolution.
        self.pooling = torch.nn.MaxPool2d(2)
        # 30420 input features for 168x168 inputs (value obtained from the
        # shape-mismatch error message rather than computed by hand).
        self.fc = torch.nn.Linear(30420, 12)

    def forward(self, x):
        # Remember the batch size so flattening keeps one row per sample.
        n = x.size(0)
        for conv in (self.conv1, self.conv2):
            # conv -> 2x2 max-pool -> relu
            x = F.relu(self.pooling(conv(x)))
        # Flatten and score the 12 classes (no softmax here: the
        # cross-entropy loss applies it internally).
        return self.fc(x.view(n, -1))
model = Net()  # instantiate the model
# Run on the GPU when one is available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
# Cross-entropy compares raw logits with integer class labels (it applies
# log-softmax internally, so the model's last layer stays linear).
criterion = torch.nn.CrossEntropyLoss()
# Plain SGD with momentum; lr is the learning rate.
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
# 训练
def train(epoch):
    """Run one training epoch, printing the mean loss every 100 batches."""
    running_loss = 0.0
    for batch_idx, (inputs, target) in enumerate(train_loader):
        # Move the batch to the same device as the model.
        inputs = inputs.to(device)
        target = target.to(device)
        # Reset gradients accumulated from the previous step (important).
        optimizer.zero_grad()
        # Forward pass, loss, backward pass, parameter update.
        loss = criterion(model(inputs), target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        # Report the running mean every 100 batches, then reset it.
        if (batch_idx + 1) % 100 == 0:
            print('[%d, %5d] loss: %.3f' % (epoch + 1, batch_idx + 1, running_loss / 100))
            running_loss = 0.0
# 测试
def test():
    """Evaluate the model on the test loader and print overall accuracy."""
    correct = 0
    total = 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for inputs, target in test_loader:
            inputs, target = inputs.to(device), target.to(device)
            outputs = model(inputs)
            # Predicted class = index of the largest logit per sample.
            predicted = outputs.data.max(dim=1)[1]
            total += target.size(0)
            correct += (predicted == target).sum().item()
    print('Accuracy on test set: %d %% [%d/%d]' % (100 * correct / total, correct, total))
# Entry point: train for 10 epochs, evaluating on the test set after each.
if __name__ == '__main__':
    for epoch in range(10):
        train(epoch)
        test()
Inception Moudel.py
使用Google Net(可以解决kernel大小的选择难题),最高精度达到过96%,平时一般为93%
import os.path
import torch.nn as nn
import torch
import numpy as np
import torch.nn.functional as F # 激活函数
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image # 用来处理图片
import torch.optim as optim
# Pre-processing pipeline applied to every image.
# NOTE(review): this rebinds the imported name `transforms` (the torchvision
# module) to a Compose object; later code relies on the rebound name.
transforms = transforms.Compose([
    # Resize every image to 168x168 (the original comment said 224x224,
    # which does not match the code).
    transforms.Resize([168, 168]),
    # Convert the PIL image to a float tensor.
    transforms.ToTensor()
])
# 自定义dataset
class MyDataset(Dataset):
    """Dataset that reads ``<path>/data.txt`` and yields cropped
    armor-plate images with their integer class labels.

    Each line of ``data.txt`` is::

        <image name> <label> <cx> <cy> <w> <h>

    where cx/cy/w/h look like a normalized (YOLO-style) bounding box —
    fractions of the image size; TODO confirm against make_txt's source.
    """

    def __init__(self, path, train=True, transform=None):
        # Index file listing one annotation per line.
        self.text_root = os.path.join(path, 'data.txt')
        imgs = []      # full image paths under <path>/images
        labels = []    # class labels (strings; converted in __getitem__)
        center_x = []  # normalized box-centre x
        center_y = []  # normalized box-centre y
        length = []    # half of the normalized box width
        width = []     # half of the normalized box height
        name = []      # bare image file names (kept for debugging)
        # `with` guarantees the file is closed (the original leaked it).
        with open(self.text_root, 'r') as f:
            for line in f:
                word = line.rstrip().split(' ')
                center_x.append(float(word[2]))
                center_y.append(float(word[3]))
                length.append(float(word[4]) / 2)
                width.append(float(word[5]) / 2)
                imgs.append(os.path.join(path, 'images', word[0]))
                labels.append(word[1])
                name.append(word[0])
        self.center_x = center_x
        self.center_y = center_y
        self.length = length
        self.width = width
        self.img = imgs
        self.label = labels
        self.transform = transform
        self.name = name

    def __len__(self):
        # One sample per annotation line.
        return len(self.label)

    def __getitem__(self, item):
        label = self.label[item]
        # Drop the alpha channel, if any.
        img = Image.open(self.img[item]).convert('RGB')
        cx = self.center_x[item]
        cy = self.center_y[item]
        half_w = self.length[item]
        half_h = self.width[item]
        # Crop out the armor plate: normalized box -> pixel coordinates.
        img = img.crop((int((cx - half_w) * img.size[0]),
                        int((cy - half_h) * img.size[1]),
                        int((cx + half_w) * img.size[0]),
                        int((cy + half_h) * img.size[1])))
        if self.transform is not None:  # resize, then convert to tensor
            img = self.transform(img)
        # CrossEntropyLoss expects int64 class indices.
        label = torch.from_numpy(np.array(label).astype(np.int64))
        return img, label
batch_size = 16
# Build the train/validation datasets from their data.txt index files.
path_train = r'.\train'
path_test = r'.\valid'
train_dataset = MyDataset(path_train, train=True, transform=transforms)
# Shuffle training batches each epoch; keep the test order fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = MyDataset(path_test, train=False, transform=transforms)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# 构造模型
class InceptionA(nn.Module):
    """GoogLeNet-style Inception block with four parallel branches.

    Output channels: 16 (1x1) + 24 (5x5) + 24 (3x3) + 24 (pool) = 88;
    every branch preserves the spatial size.
    """

    def __init__(self, in_channels):
        super(InceptionA, self).__init__()
        # Branch 1: a single 1x1 convolution.
        self.branch1x1 = nn.Conv2d(in_channels, 16, kernel_size=1)
        # Branch 2: 1x1 bottleneck followed by a padded 5x5.
        self.branch5x5_1 = nn.Conv2d(in_channels, 16, kernel_size=1)
        self.branch5x5_2 = nn.Conv2d(16, 24, kernel_size=5, padding=2)
        # Branch 3: 1x1 bottleneck followed by two padded 3x3s.
        self.branch3x3_1 = nn.Conv2d(in_channels, 16, kernel_size=1)
        self.branch3x3_2 = nn.Conv2d(16, 24, kernel_size=3, padding=1)
        self.branch3x3_3 = nn.Conv2d(24, 24, kernel_size=3, padding=1)
        # Branch 4: 3x3 average pool (stride 1) then a 1x1 convolution.
        self.branch_pool = nn.Conv2d(in_channels, 24, kernel_size=1)

    def forward(self, x):
        b1 = self.branch1x1(x)
        b2 = self.branch5x5_2(self.branch5x5_1(x))
        b3 = self.branch3x3_3(self.branch3x3_2(self.branch3x3_1(x)))
        b4 = self.branch_pool(F.avg_pool2d(x, kernel_size=3, stride=1, padding=1))
        # Concatenate along the channel axis (dim 1 of N,C,H,W).
        return torch.cat([b1, b2, b3, b4], dim=1)
class Net(nn.Module):
    """Classifier: conv -> pool -> Inception -> conv -> pool -> Inception -> fc."""

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 10, kernel_size=5)
        # 88 input channels = 16 + 24 + 24 + 24 produced by InceptionA.
        self.conv2 = nn.Conv2d(88, 20, kernel_size=5)
        self.incep1 = InceptionA(in_channels=10)  # matches conv1's 10 channels
        self.incep2 = InceptionA(in_channels=20)  # matches conv2's 20 channels
        # Max-pooling and the final fully-connected layer.
        self.mp = nn.MaxPool2d(2)
        # 133848 flattened features for 168x168 inputs; 12 armor classes.
        self.fc = nn.Linear(133848, 12)

    def forward(self, x):
        batch = x.size(0)
        # conv -> pool -> relu -> inception, twice.
        x = self.incep1(F.relu(self.mp(self.conv1(x))))  # 3 -> 10 -> 88
        x = self.incep2(F.relu(self.mp(self.conv2(x))))  # 88 -> 20 -> 88
        # Flatten and classify.
        return self.fc(x.view(batch, -1))
model = Net()
# Loss and optimizer: cross-entropy over the 12 classes, SGD with momentum.
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
# Move the model to the GPU when one is available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
# 训练
def train(epoch):
    """Run one training epoch, printing the mean loss every 300 batches."""
    running_loss = 0.0
    for batch_idx, (inputs, target) in enumerate(train_loader):
        # Move the batch to the same device as the model.
        inputs, target = inputs.to(device), target.to(device)
        optimizer.zero_grad()  # clear gradients from the previous step
        # Forward pass, loss, backward pass, parameter update.
        loss = criterion(model(inputs), target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        # Report the running mean every 300 batches, then reset it.
        if (batch_idx + 1) % 300 == 0:
            print('[%d, %5d] loss: %.3f' % (epoch + 1, batch_idx + 1, running_loss / 300))
            running_loss = 0.0
# 测试
def test():
    """Measure and print classification accuracy on the test set."""
    correct = 0
    total = 0
    with torch.no_grad():  # gradients are not needed for evaluation
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            # Predicted class = index of the largest logit per sample.
            predicted = outputs.data.max(dim=1)[1]
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print('Accuracy on test set: %d %% [%d/%d]' % (100 * correct / total, correct, total))
# Entry point: train for 10 epochs, evaluating on the test set after each.
if __name__ == '__main__':
    for epoch in range(10):
        train(epoch)
        test()
residualBlock.py
使用跳连接解决梯度消失,使前一部分样本的训练更有效。精度是三者中最高的
import torch
import torch.nn as nn
import numpy as np
import os.path
from PIL import Image
from torchvision import transforms
from torch.utils.data import DataLoader,Dataset
import torch.nn.functional as F
import torch.optim as optim
# Pre-processing pipeline applied to every image.
# NOTE(review): this rebinds the imported name `transforms` (the torchvision
# module) to a Compose object; later code relies on the rebound name.
transforms = transforms.Compose([
    # Resize every image to 168x168 (the original comment said 224x224,
    # which does not match the code).
    transforms.Resize([168, 168]),
    # Convert the PIL image to a float tensor.
    transforms.ToTensor()
])
# 自定义dataset
class MyDataset(Dataset):
    """Dataset that reads ``<path>/data.txt`` and yields cropped
    armor-plate images with their integer class labels.

    Each line of ``data.txt`` is::

        <image name> <label> <cx> <cy> <w> <h>

    where cx/cy/w/h look like a normalized (YOLO-style) bounding box —
    fractions of the image size; TODO confirm against make_txt's source.
    """

    def __init__(self, path, train=True, transform=None):
        # Index file listing one annotation per line.
        self.text_root = os.path.join(path, 'data.txt')
        imgs = []      # full image paths under <path>/images
        labels = []    # class labels (strings; converted in __getitem__)
        center_x = []  # normalized box-centre x
        center_y = []  # normalized box-centre y
        length = []    # half of the normalized box width
        width = []     # half of the normalized box height
        name = []      # bare image file names (kept for debugging)
        # `with` guarantees the file is closed (the original leaked it).
        with open(self.text_root, 'r') as f:
            for line in f:
                word = line.rstrip().split(' ')
                center_x.append(float(word[2]))
                center_y.append(float(word[3]))
                length.append(float(word[4]) / 2)
                width.append(float(word[5]) / 2)
                imgs.append(os.path.join(path, 'images', word[0]))
                labels.append(word[1])
                name.append(word[0])
        self.center_x = center_x
        self.center_y = center_y
        self.length = length
        self.width = width
        self.img = imgs
        self.label = labels
        self.transform = transform
        self.name = name

    def __len__(self):
        # One sample per annotation line.
        return len(self.label)

    def __getitem__(self, item):
        label = self.label[item]
        # Drop the alpha channel, if any.
        img = Image.open(self.img[item]).convert('RGB')
        cx = self.center_x[item]
        cy = self.center_y[item]
        half_w = self.length[item]
        half_h = self.width[item]
        # Crop out the armor plate: normalized box -> pixel coordinates.
        img = img.crop((int((cx - half_w) * img.size[0]),
                        int((cy - half_h) * img.size[1]),
                        int((cx + half_w) * img.size[0]),
                        int((cy + half_h) * img.size[1])))
        if self.transform is not None:  # resize, then convert to tensor
            img = self.transform(img)
        # CrossEntropyLoss expects int64 class indices.
        label = torch.from_numpy(np.array(label).astype(np.int64))
        return img, label
batch_size = 32
# Build the train/validation datasets from their data.txt index files.
path_train = r'.\train'
path_test = r'.\valid'
train_dataset = MyDataset(path_train, train=True, transform=transforms)
# Shuffle training batches each epoch; keep the test order fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = MyDataset(path_test, train=False, transform=transforms)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# design model using class
class ResidualBlock(nn.Module):
    """Two 3x3 convolutions with an identity skip connection.

    Channel count and spatial size are preserved, so the input can be
    added to the convolution output directly: relu(x + F(x)).
    """

    def __init__(self, channels):
        super(ResidualBlock, self).__init__()
        self.channels = channels
        # padding=1 keeps H and W unchanged for a 3x3 kernel.
        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)

    def forward(self, x):
        residual = self.conv2(F.relu(self.conv1(x)))
        # Skip connection: add the input back before the final activation,
        # which keeps gradients flowing through the identity path.
        return F.relu(x + residual)
class Net(nn.Module):
    """Classifier: two conv/pool stages, each followed by a residual block."""

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=5)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5)
        self.rblock1 = ResidualBlock(16)
        self.rblock2 = ResidualBlock(32)
        self.mp = nn.MaxPool2d(2)
        # 48672 flattened features for 168x168 inputs; obtained from the
        # shape-mismatch error message rather than computed by hand.
        self.fc = nn.Linear(48672, 12)

    def forward(self, x):
        batch = x.size(0)
        # conv -> relu -> pool -> residual block, twice.
        x = self.rblock1(self.mp(F.relu(self.conv1(x))))
        x = self.rblock2(self.mp(F.relu(self.conv2(x))))
        # Flatten and score the 12 classes.
        return self.fc(x.view(batch, -1))
model = Net()
# Cross-entropy loss over the 12 classes; SGD with momentum.
# NOTE(review): unlike the other two scripts, this one never moves the
# model or the batches to a GPU, so everything runs on the CPU.
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
def train(epoch):
    """Run one training epoch, printing the mean loss every 300 batches."""
    running_loss = 0.0
    for batch_idx, (inputs, target) in enumerate(train_loader):
        optimizer.zero_grad()  # clear gradients from the previous step
        # Forward pass, loss, backward pass, parameter update.
        loss = criterion(model(inputs), target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        # Report the running mean every 300 batches, then reset it.
        if (batch_idx + 1) % 300 == 0:
            print('[%d, %5d] loss: %.3f' % (epoch + 1, batch_idx + 1, running_loss / 300))
            running_loss = 0.0
def test():
    """Measure and print classification accuracy on the test set.

    The output format is made consistent with the other two training
    scripts: ``Accuracy on test set: <pct> % [correct/total]`` (the
    original printed a lowercase message without the counts).
    """
    correct = 0
    total = 0
    with torch.no_grad():  # gradients are not needed for evaluation
        for images, labels in test_loader:
            outputs = model(images)
            # Predicted class = index of the largest logit per sample.
            _, predicted = torch.max(outputs.data, dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print('Accuracy on test set: %d %% [%d/%d]' % (100 * correct / total, correct, total))
# Entry point: train for 10 epochs, evaluating on the test set after each.
if __name__ == '__main__':
    for epoch in range(10):
        train(epoch)
        test()
神经网络架构搭建流程(CNN)
1. 导入数据集
使用torchvision提供的数据集
通过torchvision的datasets可以获得MNIST等已有的数据集,可以通过参数进行下载
使用自定义的数据集
使用ImageFolder需要将文件放到特定位置,使用pytorch提供的Dataset类可以有更高的自由度
- 通过程序创建一个txt文件,这个文件的每一行包含文件名、类型、四个关于坐标的参数
- 在自定义dataset中读取这个txt文件,在__getitem__方法中进行图片的裁剪
灵感来源:
- (13条消息) pytorch加载自己的图片数据集的两种方法__-周-_的博客-CSDN博客_pytorch加载自己的数据集
- (13条消息) 有趣的深度学习2——利用pytorch对数据集进行预处理_树杰同学的博客-CSDN博客_pytorch三维数据预处理
- (13条消息) PyTorch加载数据集_pytorch 数据集_心️升明月的博客-CSDN博客
图片裁剪:Pillow - 裁剪图像 - 蝴蝶教程 (jc2182.com)
2. 构造模型
卷积-池化-激活,最后进行线性
注意:
-
最后一层不做激活,因为交叉熵损失中自带激活
-
最后全连接层的线形层转换,Linear的第一个参数不要手算,可以通过程序报错得到数据(也可以通过代码输出size进行乘法)例子:(13条消息) RuntimeError: mat1 and mat2 shapes cannot be multiplied (5760x6 and 128x4)_多吃维C_的博客-CSDN博客_mat1 and mat2 shapes cannot be multiplied
-
forward中直接通过一个参数进行迭代,不要每一层都设置一个参数,容易搞混
-
最好通过torch.device将模型迁移到GPU中
3. 构造损失函数与优化器
一般损失函数使用交叉熵函数
优化器这里使用SGD
4. 进行训练循环
整体过程为:正向计算得到损失,反向传播求出梯度
注意
-
对应的输入输出也可以迁移到GPU中,并且要和模型在同一块显卡上
-
每轮都要进行优化器清零,更新权重
-
防止过拟合,要保证模型具有泛化的能力
神经网络的结构
注释
优化方法
- 更改激活函数
- 加大样本量
- 加深神经网络的复杂度,增加层数(同时防止梯度消失、过拟合)