1. Current problems in image segmentation
Modern deep CNNs can extract many feature maps from an image, capturing texture, color, and shape. For segmentation, however, not all of these are equally useful: ideally, a segmentation model should recognize objects from boundary and shape information, and an information stream dominated by color and texture cues can hurt recognition.
2. The solution proposed by this network
To address this, the authors propose a two-stream CNN architecture: a dedicated shape stream is separated out and runs in parallel with the regular CNN stream, and the features learned by the two streams are finally fused through an ASPP module, which improves semantic segmentation performance.
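To make the layout concrete, here is a minimal, hypothetical sketch of a two-stream network; the module sizes and the 1x1-conv fusion (standing in for ASPP) are illustrative assumptions, not the paper's exact configuration:

```python
import torch
from torch import nn

class TwoStreamSketch(nn.Module):
    """Illustrative two-stream layout: a regular (texture/color) stream and
    a shape stream whose outputs are fused before the segmentation head."""
    def __init__(self, num_classes):
        super().__init__()
        # regular stream: stand-in for a deep CNN backbone
        self.regular = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1), nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, 3, padding=1), nn.ReLU(inplace=True))
        # shape stream: a shallow branch meant to carry boundary information
        self.shape = nn.Sequential(
            nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(inplace=True),
            nn.Conv2d(16, 1, 1), nn.Sigmoid())
        # fusion: concat + 1x1 conv stands in for the ASPP fusion
        self.fuse = nn.Conv2d(64 + 1, num_classes, 1)

    def forward(self, x):
        texture = self.regular(x)
        edge = self.shape(x)          # 1-channel boundary map
        return self.fuse(torch.cat([texture, edge], dim=1))

out = TwoStreamSketch(num_classes=19)(torch.randn(1, 3, 64, 64))
print(out.shape)  # torch.Size([1, 19, 64, 64])
```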
3. Overall network structure
The network in End-to-End Referring Video Object Segmentation with Multimodal Transformers consists of two main parts: a referring expression understanding module and a video object segmentation module.

Referring expression understanding module:
The referring expression understanding module itself has two components: a natural language processing module and a visual feature extraction module. The language module uses a BERT model to encode the natural-language instruction into an instruction vector; the visual module uses a ResNet-101 network to extract visual features from the video frames.
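As a rough sketch of these two encoders (the specific checkpoint, the sample sentence, and the input size are assumptions for illustration; the text side uses the Hugging Face transformers library, the visual side torchvision):

```python
import torch
from torch import nn
from torchvision import models
from transformers import BertTokenizer, BertModel

# text encoder: BERT yields one embedding per token of the instruction
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert = BertModel.from_pretrained("bert-base-uncased")
tokens = tokenizer("the man riding a bike", return_tensors="pt")
text_feats = bert(**tokens).last_hidden_state        # (1, num_tokens, 768)

# visual encoder: ResNet-101 with the classification head removed
resnet = models.resnet101()                           # load weights as needed
backbone = nn.Sequential(*list(resnet.children())[:-2])
frame = torch.randn(1, 3, 224, 224)                   # one video frame
visual_feats = backbone(frame)                        # (1, 2048, 7, 7)
```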
The outputs of these two modules are passed into a multimodal Transformer for fusion and joint encoding. The multimodal Transformer is a self-attention-based neural network that can process text and image information together and produce the corresponding feature vectors.
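A minimal way to realize this kind of fusion, assuming both modalities are first projected to a shared width (all sizes below are illustrative):

```python
import torch
from torch import nn

d = 256
text_proj = nn.Linear(768, d)      # project BERT token features
vis_proj = nn.Linear(2048, d)      # project ResNet-101 features

text_tokens = text_proj(torch.randn(1, 6, 768))          # (1, 6, d)
vis_tokens = vis_proj(torch.randn(1, 7 * 7, 2048))       # flattened 7x7 grid

# one sequence containing both modalities; self-attention mixes them
fused_in = torch.cat([text_tokens, vis_tokens], dim=1)   # (1, 55, d)
layer = nn.TransformerEncoderLayer(d_model=d, nhead=8, batch_first=True)
encoder = nn.TransformerEncoder(layer, num_layers=3)
fused = encoder(fused_in)                                # (1, 55, d)
```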
Video object segmentation module:
The video object segmentation module also consists of two parts: a spatial mask generation module and a mask refinement module. The mask generation module uses multi-head attention to produce an initial spatial mask that describes the position and shape of the target object in the video; the refinement module then improves this mask iteratively to increase segmentation accuracy.
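One common way to turn attention into an initial mask, sketched here under assumptions (a single learned query attends over the fused per-pixel features, and its output embedding is dotted with those features to produce mask logits):

```python
import torch
from torch import nn

d, h, w = 256, 7, 7
pixel_feats = torch.randn(1, h * w, d)      # fused per-pixel features
query = nn.Parameter(torch.randn(1, 1, d))  # one learned object query

# multi-head attention: the query gathers evidence from all pixels
attn = nn.MultiheadAttention(embed_dim=d, num_heads=8, batch_first=True)
obj_embed, _ = attn(query, pixel_feats, pixel_feats)     # (1, 1, d)

# initial mask: similarity between the object embedding and each pixel
mask_logits = (pixel_feats @ obj_embed.transpose(1, 2)).view(1, 1, h, w)
mask = torch.sigmoid(mask_logits)           # (1, 1, 7, 7), refined later
```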
The whole network is trained end to end, so it learns object segmentation directly from natural-language instructions and video frames. At the same time, the multimodal Transformer lets the network model the complex relationships between text and images, which improves segmentation accuracy and robustness.
4. Test results
5. Loss function design
This is a multi-task learning setup (edge detection guides semantic segmentation). The total loss consists of four parts: the loss functions of the two tasks plus two regularization terms.
The total loss can be written as

$\mathcal{L} = \lambda_1 \mathcal{L}_{edge} + \lambda_2 \mathcal{L}_{seg} + \lambda_3 \mathcal{L}_{reg_1} + \lambda_4 \mathcal{L}_{reg_2}$

where $\mathcal{L}_{edge}$ is the boundary (edge detection) loss, $\mathcal{L}_{seg}$ is the semantic segmentation loss, and $\mathcal{L}_{reg_1}, \mathcal{L}_{reg_2}$ are the two dual-task regularization terms; the $\lambda_i$ are the loss weights listed below (a code sketch of this objective follows the experiment details). Experiment details:
- batch_size = 16
- lr = 0.01 with polynomial decay
- loss weights: 20, 1, 1, 1
- resolution: 800×800
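A minimal sketch of how the four-part objective could be assembled in code; the two regularizers below are simplified placeholders (the actual dual-task regularizers couple the edge and segmentation predictions around boundaries):

```python
import torch
import torch.nn.functional as F

def total_loss(seg_logits, seg_gt, edge_prob, edge_gt,
               lambdas=(20.0, 1.0, 1.0, 1.0)):
    """Weighted four-part objective: edge BCE + segmentation CE + two
    (placeholder) regularization terms."""
    l_edge = F.binary_cross_entropy(edge_prob, edge_gt)   # boundary task
    l_seg = F.cross_entropy(seg_logits, seg_gt)           # segmentation task
    # placeholder regularizers; the paper's dual-task regularizers couple the
    # two predictions, which is only approximated here
    conf = seg_logits.softmax(1).max(1).values            # (N, H, W)
    l_reg1 = (edge_prob.squeeze(1) * (1.0 - conf)).mean()
    l_reg2 = edge_prob.mean()                             # edge sparsity
    return (lambdas[0] * l_edge + lambdas[1] * l_seg
            + lambdas[2] * l_reg1 + lambdas[3] * l_reg2)
```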
Code
import torch
import torch.nn.functional as F
from torch import nn
from network import SEresnext
from network import Resnet
from network.wider_resnet import wider_resnet38_a2
from config import cfg
from network.mynn import initialize_weights, Norm2d
from torch.autograd import Variable
from my_functionals import GatedSpatialConv as gsc
import cv2
import numpy as np
class Crop(nn.Module):
def __init__(self, axis, offset):
super(Crop, self).__init__()
self.axis = axis
self.offset = offset
def forward(self, x, ref):
""" :param x: input layer :param ref: reference usually data in :return: """
for axis in range(self.axis, x.dim()):
ref_size = ref.size(axis)
indices = torch.arange(self.offset, self.offset + ref_size).long()
indices = x.data.new().resize_(indices.size()).copy_(indices).long()
x = x.index_select(axis, Variable(indices))
return x
class MyIdentity(nn.Module):
def __init__(self, axis, offset):
super(MyIdentity, self).__init__()
self.axis = axis
self.offset = offset
def forward(self, x, ref):
""" :param x: input layer :param ref: reference usually data in :return: """
return x
class SideOutputCrop(nn.Module):
""" This is the original implementation ConvTranspose2d (fixed) and crops """
def __init__(self, num_output, kernel_sz=None, stride=None, upconv_pad=0, do_crops=True):
super(SideOutputCrop, self).__init__()
self._do_crops = do_crops
self.conv = nn.Conv2d(num_output, out_channels=1, kernel_size=1, stride=1, padding=0, bias=True)
if kernel_sz is not None:
self.upsample = True
self.upsampled = nn.ConvTranspose2d(1, out_channels=1, kernel_size=kernel_sz, stride=stride,
padding=upconv_pad,
bias=False)
##doing crops
if self._do_crops:
self.crops = Crop(2, offset=kernel_sz // 4)
else:
self.crops = MyIdentity(None, None)
else:
self.upsample = False
def forward(self, res, reference=None):
side_output = self.conv(res)
if self.upsample:
side_output = self.upsampled(side_output)
side_output = self.crops(side_output, reference)
return side_output
class _AtrousSpatialPyramidPoolingModule(nn.Module):
    '''
    operations performed:
      1x1 conv x depth
      3x3 conv x depth, dilation 6
      3x3 conv x depth, dilation 12
      3x3 conv x depth, dilation 18
      image pooling
      concatenate all together
      final 1x1 conv
    '''
def __init__(self, in_dim, reduction_dim=256, output_stride=16, rates=[6, 12, 18]):
super(_AtrousSpatialPyramidPoolingModule, self).__init__()
# Check if we are using distributed BN and use the nn from encoding.nn
# library rather than using standard pytorch.nn
if output_stride == 8:
rates = [2 * r for r in rates]
elif output_stride == 16:
pass
else:
            raise ValueError('output stride of {} not supported'.format(output_stride))
self.features = []
# 1x1
self.features.append(
nn.Sequential(nn.Conv2d(in_dim, reduction_dim, kernel_size=1, bias=False),
Norm2d(reduction_dim), nn.ReLU(inplace=True)))
# other rates
for r in rates:
self.features.append(nn.Sequential(
nn.Conv2d(in_dim, reduction_dim, kernel_size=3,
dilation=r, padding=r, bias=False),
Norm2d(reduction_dim),
nn.ReLU(inplace=True)
))
self.features = torch.nn.ModuleList(self.features)
# img level features
self.img_pooling = nn.AdaptiveAvgPool2d(1)
self.img_conv = nn.Sequential(
nn.Conv2d(in_dim, reduction_dim, kernel_size=1, bias=False),
Norm2d(reduction_dim), nn.ReLU(inplace=True))
self.edge_conv = nn.Sequential(
nn.Conv2d(1, reduction_dim, kernel_size=1, bias=False),
Norm2d(reduction_dim), nn.ReLU(inplace=True))
def forward(self, x, edge):
x_size = x.size()
img_features = self.img_pooling(x)
img_features = self.img_conv(img_features)
img_features = F.interpolate(img_features, x_size[2:],
mode='bilinear',align_corners=True)
out = img_features
edge_features = F.interpolate(edge, x_size[2:],
mode='bilinear',align_corners=True)
edge_features = self.edge_conv(edge_features)
out = torch.cat((out, edge_features), 1)
for f in self.features:
y = f(x)
out = torch.cat((out, y), 1)
return out
class GSCNN(nn.Module):
    '''
    Wide_resnet version of DeepLabV3
    mod1
    pool2
    mod2 str2
    pool3
    mod3-7
    structure: [3, 3, 6, 3, 1, 1]
    channels = [(128, 128), (256, 256), (512, 512), (512, 1024),
                (512, 1024, 2048), (1024, 2048, 4096)]
    '''
def __init__(self, num_classes, trunk=None, criterion=None):
super(GSCNN, self).__init__()
self.criterion = criterion
self.num_classes = num_classes
wide_resnet = wider_resnet38_a2(classes=1000, dilation=True)
wide_resnet = torch.nn.DataParallel(wide_resnet)
wide_resnet = wide_resnet.module
self.mod1 = wide_resnet.mod1
self.mod2 = wide_resnet.mod2
self.mod3 = wide_resnet.mod3
self.mod4 = wide_resnet.mod4
self.mod5 = wide_resnet.mod5
self.mod6 = wide_resnet.mod6
self.mod7 = wide_resnet.mod7
self.pool2 = wide_resnet.pool2
self.pool3 = wide_resnet.pool3
self.interpolate = F.interpolate
del wide_resnet
self.dsn1 = nn.Conv2d(64, 1, 1)
self.dsn3 = nn.Conv2d(256, 1, 1)
self.dsn4 = nn.Conv2d(512, 1, 1)
self.dsn7 = nn.Conv2d(4096, 1, 1)
self.res1 = Resnet.BasicBlock(64, 64, stride=1, downsample=None)
self.d1 = nn.Conv2d(64, 32, 1)
self.res2 = Resnet.BasicBlock(32, 32, stride=1, downsample=None)
self.d2 = nn.Conv2d(32, 16, 1)
self.res3 = Resnet.BasicBlock(16, 16, stride=1, downsample=None)
self.d3 = nn.Conv2d(16, 8, 1)
self.fuse = nn.Conv2d(8, 1, kernel_size=1, padding=0, bias=False)
self.cw = nn.Conv2d(2, 1, kernel_size=1, padding=0, bias=False)
self.gate1 = gsc.GatedSpatialConv2d(32, 32)
self.gate2 = gsc.GatedSpatialConv2d(16, 16)
self.gate3 = gsc.GatedSpatialConv2d(8, 8)
self.aspp = _AtrousSpatialPyramidPoolingModule(4096, 256,
output_stride=8)
self.bot_fine = nn.Conv2d(128, 48, kernel_size=1, bias=False)
self.bot_aspp = nn.Conv2d(1280 + 256, 256, kernel_size=1, bias=False)
self.final_seg = nn.Sequential(
nn.Conv2d(256 + 48, 256, kernel_size=3, padding=1, bias=False),
Norm2d(256),
nn.ReLU(inplace=True),
nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=False),
Norm2d(256),
nn.ReLU(inplace=True),
nn.Conv2d(256, num_classes, kernel_size=1, bias=False))
self.sigmoid = nn.Sigmoid()
initialize_weights(self.final_seg)
def forward(self, inp, gts=None):
x_size = inp.size()
# res 1
m1 = self.mod1(inp)
# res 2
m2 = self.mod2(self.pool2(m1))
# res 3
m3 = self.mod3(self.pool3(m2))
# res 4-7
m4 = self.mod4(m3)
m5 = self.mod5(m4)
m6 = self.mod6(m5)
m7 = self.mod7(m6)
        # 1-channel side outputs from the regular stream; they later gate the shape stream
        s3 = F.interpolate(self.dsn3(m3), x_size[2:],
                           mode='bilinear', align_corners=True)
        s4 = F.interpolate(self.dsn4(m4), x_size[2:],
                           mode='bilinear', align_corners=True)
        s7 = F.interpolate(self.dsn7(m7), x_size[2:],
                           mode='bilinear', align_corners=True)
m1f = F.interpolate(m1, x_size[2:], mode='bilinear', align_corners=True)
        # compute a Canny edge map for each image in the batch as an extra,
        # non-learned shape cue
        im_arr = inp.cpu().numpy().transpose((0, 2, 3, 1)).astype(np.uint8)
        canny = np.zeros((x_size[0], 1, x_size[2], x_size[3]))
        for i in range(x_size[0]):
            canny[i] = cv2.Canny(im_arr[i], 10, 100)
        canny = torch.from_numpy(canny).cuda().float()
        # shape stream: residual blocks and gated convolutions conditioned on
        # the side outputs from the regular stream
        cs = self.res1(m1f)
cs = F.interpolate(cs, x_size[2:],
mode='bilinear', align_corners=True)
cs = self.d1(cs)
cs = self.gate1(cs, s3)
cs = self.res2(cs)
cs = F.interpolate(cs, x_size[2:],
mode='bilinear', align_corners=True)
cs = self.d2(cs)
cs = self.gate2(cs, s4)
cs = self.res3(cs)
cs = F.interpolate(cs, x_size[2:],
mode='bilinear', align_corners=True)
cs = self.d3(cs)
cs = self.gate3(cs, s7)
cs = self.fuse(cs)
cs = F.interpolate(cs, x_size[2:],
mode='bilinear', align_corners=True)
edge_out = self.sigmoid(cs)
        # fuse the learned edge map with the Canny map to form the ASPP edge input
        cat = torch.cat((edge_out, canny), dim=1)
acts = self.cw(cat)
acts = self.sigmoid(acts)
# aspp
x = self.aspp(m7, acts)
dec0_up = self.bot_aspp(x)
dec0_fine = self.bot_fine(m2)
dec0_up = self.interpolate(dec0_up, m2.size()[2:], mode='bilinear',align_corners=True)
dec0 = [dec0_fine, dec0_up]
dec0 = torch.cat(dec0, 1)
dec1 = self.final_seg(dec0)
seg_out = self.interpolate(dec1, x_size[2:], mode='bilinear')
if self.training:
return self.criterion((seg_out, edge_out), gts)
else:
return seg_out, edge_out
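A quick smoke test for the model above, assuming the GSCNN repository's network and my_functionals packages are on PYTHONPATH and a GPU is available (the forward pass calls .cuda() internally); num_classes=19 corresponds to Cityscapes:

```python
import torch

model = GSCNN(num_classes=19).cuda().eval()
dummy = torch.randn(1, 3, 800, 800).cuda()  # matches the 800x800 training resolution
with torch.no_grad():
    seg_out, edge_out = model(dummy)        # eval mode skips the criterion
print(seg_out.shape, edge_out.shape)        # (1, 19, 800, 800), (1, 1, 800, 800)
```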
6. Running on Intel architectures (OpenVINO)
import cv2
from openvino.inference_engine import IECore
# load the model and choose a device
model_xml = 'path/to/model.xml'
model_bin = 'path/to/model.bin'
device = 'CPU'  # can be CPU, GPU, MYRIAD, HETERO:FPGA,CPU, etc.
ie = IECore()
net = ie.read_network(model=model_xml, weights=model_bin)
exec_net = ie.load_network(network=net, device_name=device)
# read the input image
image = cv2.imread('path/to/input_image.jpg')
# preprocess the input image
input_blob = next(iter(net.input_info))
n, c, h, w = net.input_info[input_blob].input_data.shape
processed_image = cv2.resize(image, (w, h))
processed_image = processed_image.transpose((2, 0, 1))
processed_image = processed_image.reshape((n, c, h, w))
# run inference
output_blob = next(iter(net.outputs))
result = exec_net.infer(inputs={input_blob: processed_image})
# post-process the output
output = result[output_blob]
# display or save the result
# ...
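The post-processing step is left open above. For a segmentation model whose output blob is an N×C×H×W score map (an assumption about this particular model), a common recipe is an argmax over the class axis plus a color overlay:

```python
import numpy as np
import cv2

# assumes `output` has shape (1, num_classes, H, W)
mask = np.argmax(output[0], axis=0).astype(np.uint8)       # (H, W) class ids
mask = cv2.resize(mask, (image.shape[1], image.shape[0]),
                  interpolation=cv2.INTER_NEAREST)          # back to input size
color = cv2.applyColorMap(mask * 10, cv2.COLORMAP_JET)     # crude visualization
overlay = cv2.addWeighted(image, 0.6, color, 0.4, 0)
cv2.imwrite('segmentation_overlay.jpg', overlay)
```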