1. Current problems in image segmentation
Modern deep CNNs can extract many feature maps from an image, capturing texture, color, and shape. For segmentation, however, not all of these are equally useful: ideally, a segmentation model should recognize objects from boundary and shape information, and an information stream dominated by color and texture cues can hurt recognition.
2. The solution proposed by this network
To address this, the authors propose a two-stream CNN architecture: a dedicated shape stream is separated out and runs in parallel with the regular CNN stream, and the features learned by the two streams are finally fused through an ASPP module, which improves semantic segmentation performance.
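To make the layout concrete, here is a minimal, hypothetical sketch of a two-stream network; the module sizes and the 1x1-conv fusion (standing in for ASPP) are illustrative assumptions, not the paper's exact configuration:

```python
import torch
from torch import nn

class TwoStreamSketch(nn.Module):
    """Illustrative two-stream layout: a regular (texture/color) stream and
    a shape stream whose outputs are fused before the segmentation head."""
    def __init__(self, num_classes):
        super().__init__()
        # regular stream: stand-in for a deep CNN backbone
        self.regular = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1), nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, 3, padding=1), nn.ReLU(inplace=True))
        # shape stream: a shallow branch meant to carry boundary information
        self.shape = nn.Sequential(
            nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(inplace=True),
            nn.Conv2d(16, 1, 1), nn.Sigmoid())
        # fusion: concat + 1x1 conv stands in for the ASPP fusion
        self.fuse = nn.Conv2d(64 + 1, num_classes, 1)

    def forward(self, x):
        texture = self.regular(x)
        edge = self.shape(x)          # 1-channel boundary map
        return self.fuse(torch.cat([texture, edge], dim=1))

out = TwoStreamSketch(num_classes=19)(torch.randn(1, 3, 64, 64))
print(out.shape)  # torch.Size([1, 19, 64, 64])
```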
3. Overall network structure
The network in End-to-End Referring Video Object Segmentation with Multimodal Transformers consists of two main parts: a referring expression understanding module and a video object segmentation module.

Referring expression understanding module:
The referring expression understanding module itself has two components: a natural language processing module and a visual feature extraction module. The language module uses a BERT model to encode the natural-language instruction into an instruction vector; the visual module uses a ResNet-101 network to extract visual features from the video frames.
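As a rough sketch of these two encoders (the specific checkpoint, the sample sentence, and the input size are assumptions for illustration; the text side uses the Hugging Face transformers library, the visual side torchvision):

```python
import torch
from torch import nn
from torchvision import models
from transformers import BertTokenizer, BertModel

# text encoder: BERT yields one embedding per token of the instruction
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert = BertModel.from_pretrained("bert-base-uncased")
tokens = tokenizer("the man riding a bike", return_tensors="pt")
text_feats = bert(**tokens).last_hidden_state        # (1, num_tokens, 768)

# visual encoder: ResNet-101 with the classification head removed
resnet = models.resnet101()                           # load weights as needed
backbone = nn.Sequential(*list(resnet.children())[:-2])
frame = torch.randn(1, 3, 224, 224)                   # one video frame
visual_feats = backbone(frame)                        # (1, 2048, 7, 7)
```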
The outputs of these two modules are passed into a multimodal Transformer for fusion and joint encoding. The multimodal Transformer is a self-attention-based neural network that can process text and image information together and produce the corresponding feature vectors.
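A minimal way to realize this kind of fusion, assuming both modalities are first projected to a shared width (all sizes below are illustrative):

```python
import torch
from torch import nn

d = 256
text_proj = nn.Linear(768, d)      # project BERT token features
vis_proj = nn.Linear(2048, d)      # project ResNet-101 features

text_tokens = text_proj(torch.randn(1, 6, 768))          # (1, 6, d)
vis_tokens = vis_proj(torch.randn(1, 7 * 7, 2048))       # flattened 7x7 grid

# one sequence containing both modalities; self-attention mixes them
fused_in = torch.cat([text_tokens, vis_tokens], dim=1)   # (1, 55, d)
layer = nn.TransformerEncoderLayer(d_model=d, nhead=8, batch_first=True)
encoder = nn.TransformerEncoder(layer, num_layers=3)
fused = encoder(fused_in)                                # (1, 55, d)
```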
Video object segmentation module:
The video object segmentation module also consists of two parts: a spatial mask generation module and a mask refinement module. The mask generation module uses multi-head attention to produce an initial spatial mask that describes the position and shape of the target object in the video; the refinement module then improves this mask iteratively to increase segmentation accuracy.
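One common way to turn attention into an initial mask, sketched here under assumptions (a single learned query attends over the fused per-pixel features, and its output embedding is dotted with those features to produce mask logits):

```python
import torch
from torch import nn

d, h, w = 256, 7, 7
pixel_feats = torch.randn(1, h * w, d)      # fused per-pixel features
query = nn.Parameter(torch.randn(1, 1, d))  # one learned object query

# multi-head attention: the query gathers evidence from all pixels
attn = nn.MultiheadAttention(embed_dim=d, num_heads=8, batch_first=True)
obj_embed, _ = attn(query, pixel_feats, pixel_feats)     # (1, 1, d)

# initial mask: similarity between the object embedding and each pixel
mask_logits = (pixel_feats @ obj_embed.transpose(1, 2)).view(1, 1, h, w)
mask = torch.sigmoid(mask_logits)           # (1, 1, 7, 7), refined later
```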
The whole network is trained end to end, so it learns object segmentation directly from natural-language instructions and video frames. At the same time, the multimodal Transformer lets the network model the complex relationships between text and images, which improves segmentation accuracy and robustness.
4. Test results
5. Loss function design
This is a multi-task learning setup (edge detection guides semantic segmentation). The total loss consists of four parts: the loss functions of the two tasks plus two regularization terms.
The total loss can be written as

$\mathcal{L} = \lambda_1 \mathcal{L}_{edge} + \lambda_2 \mathcal{L}_{seg} + \lambda_3 \mathcal{L}_{reg_1} + \lambda_4 \mathcal{L}_{reg_2}$

where $\mathcal{L}_{edge}$ is the boundary (edge detection) loss, $\mathcal{L}_{seg}$ is the semantic segmentation loss, and $\mathcal{L}_{reg_1}, \mathcal{L}_{reg_2}$ are the two dual-task regularization terms; the $\lambda_i$ are the loss weights listed below (a code sketch of this objective follows the experiment details). Experiment details:
- batch_size = 16
- lr = 0.01 with polynomial decay
- loss weights: 20, 1, 1, 1
- resolution: 800×800
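A minimal sketch of how the four-part objective could be assembled in code; the two regularizers below are simplified placeholders (the actual dual-task regularizers couple the edge and segmentation predictions around boundaries):

```python
import torch
import torch.nn.functional as F

def total_loss(seg_logits, seg_gt, edge_prob, edge_gt,
               lambdas=(20.0, 1.0, 1.0, 1.0)):
    """Weighted four-part objective: edge BCE + segmentation CE + two
    (placeholder) regularization terms."""
    l_edge = F.binary_cross_entropy(edge_prob, edge_gt)   # boundary task
    l_seg = F.cross_entropy(seg_logits, seg_gt)           # segmentation task
    # placeholder regularizers; the paper's dual-task regularizers couple the
    # two predictions, which is only approximated here
    conf = seg_logits.softmax(1).max(1).values            # (N, H, W)
    l_reg1 = (edge_prob.squeeze(1) * (1.0 - conf)).mean()
    l_reg2 = edge_prob.mean()                             # edge sparsity
    return (lambdas[0] * l_edge + lambdas[1] * l_seg
            + lambdas[2] * l_reg1 + lambdas[3] * l_reg2)
```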
Code
import torch
import torch.nn.functional as F
from torch import nn
from network import SEresnext
from network import Resnet
from network.wider_resnet import wider_resnet38_a2
from config import cfg
from network.mynn import initialize_weights, Norm2d
from torch.autograd import Variable
from my_functionals import GatedSpatialConv as gsc
import cv2
import numpy as np
class Crop(nn.Module):
def __init__(self, axis, offset):
super(Crop, self).__init__()
self.axis = axis
self.offset = offset
def forward(self, x, ref):
""" :param x: input layer :param ref: reference usually data in :return: """
for axis in range(self.axis, x.dim()):
ref_size = ref.size(axis)
indices = torch.arange(self.offset, self.offset + ref_size).long()
indices = x.data.new().resize_(indices.size()).copy_(indices).long()
x = x.index_select(axis, Variable(indices))
return x
class MyIdentity(nn.Module):
def __init__(self, axis, offset):
super(MyIdentity, self).__init__()
self.axis = axis
self.offset = offset
def forward(self, x, ref):
""" :param x: input layer :param ref: reference usually data in :return: """
return x
class SideOutputCrop(nn.Module):
""" This is the original implementation ConvTranspose2d (fixed) and crops """
def __init__(self, num_output, kernel_sz=None, stride=None, upconv_pad=0, do_crops=True):
super(SideOutputCrop, self).__init__()
self._do_crops = do_crops
self.conv = nn.Conv2d(num_output, out_channels=1, kernel_size=1, stride=1, padding=0, bias=True)
if kernel_sz is not None:
self.upsample = True
self.upsampled = nn.ConvTranspose2d(1, out_channels=1, kernel_size=kernel_sz, stride=stride,
padding=upconv_pad,
bias=False)
##doing crops
if self._do_crops:
self.crops = Crop(2, offset=kernel_sz // 4)
else:
self.crops = MyIdentity(None, None)
else:
self.upsample = False
def forward(self, res, reference=None):
side_output = self.conv(res)
if self.upsample:
side_output = self.upsampled(side_output)
side_output = self.crops(side_output, reference)
return side_output
class _AtrousSpatialPyramidPoolingModule(nn.Module):
    '''
    operations performed:
      1x1 conv x depth
      3x3 conv x depth, dilation 6
      3x3 conv x depth, dilation 12
      3x3 conv x depth, dilation 18
      image pooling
      concatenate all together
      final 1x1 conv
    '''
def __init__(self, in_dim, reduction_dim=256, output_stride=16, rates=[6, 12, 18]):
super(_AtrousSpatialPyramidPoolingModule, self).__init__()
# Check if we are using distributed BN and use the nn from encoding.nn
# library rather than using standard pytorch.nn
if output_stride == 8:
rates = [2 * r for r in rates]
elif output_stride == 16:
pass
else:
            raise ValueError('output stride of {} not supported'.format(output_stride))
self.features = []
# 1x1
self.features.append(
nn.Sequential(nn.Conv2d(in_dim, reduction_dim, kernel_size=1, bias=False),
Norm2d(reduction_dim), nn.ReLU(inplace=True)))
# other rates
for r in rates:
self.features.append(nn.Sequential(
nn.Conv2d(in_dim, reduction_dim, kernel_size=3,
dilation=r, padding=r, bias=False),
Norm2d(reduction_dim),
nn.ReLU(inplace=True)
))
self.features = torch.nn.ModuleList(self.features)
# img level features
self.img_pooling = nn.AdaptiveAvgPool2d(1)
self.img_conv = nn.Sequential(
nn.Conv2d(in_dim, reduction_dim, kernel_size=1, bias=False),
Norm2d(reduction_dim), nn.ReLU(inplace=True))
self.edge_conv = nn.Sequential(
nn.Conv2d(1, reduction_dim, kernel_size=1, bias=False),
Norm2d(reduction_dim), nn.ReLU(inplace=True))
def forward(self, x, edge):
x_size = x.size()
img_features = self.img_pooling(x)
img_features = self.img_conv(img_features)
img_features = F.interpolate(img_features, x_size[2:],
mode='bilinear',align_corners=True)
out = img_features
edge_features = F.interpolate(edge, x_size[2:],
mode='bilinear',align_corners=True)
edge_features = self.edge_conv(edge_features)
out = torch.cat((out, edge_features), 1)
for f in self.features:
y = f(x)
out = torch.cat((out, y), 1)
return out
class GSCNN(nn.Module):
    '''
    Wide_resnet version of DeepLabV3
    mod1
    pool2
    mod2 str2
    pool3
    mod3-7
    structure: [3, 3, 6, 3, 1, 1]
    channels = [(128, 128), (256, 256), (512, 512), (512, 1024),
                (512, 1024, 2048), (1024, 2048, 4096)]
    '''
def __init__(self, num_classes, trunk=None, criterion=None):
super(GSCNN, self).__init__()
self.criterion = criterion
self.num_classes = num_classes
wide_resnet = wider_resnet38_a2(classes=1000, dilation=True)
wide_resnet = torch.nn.DataParallel(wide_resnet)
wide_resnet = wide_resnet.module
self.mod1 = wide_resnet.mod1
self.mod2 = wide_resnet.mod2
self.mod3 = wide_resnet.mod3
self.mod4 = wide_resnet.mod4
self.mod5 = wide_resnet.mod5
self.mod6 = wide_resnet.mod6
self.mod7 = wide_resnet.mod7
self.pool2 = wide_resnet.pool2
self.pool3 = wide_resnet.pool3
self.interpolate = F.interpolate
del wide_resnet
self.dsn1 = nn.Conv2d(64, 1, 1)
self.dsn3 = nn.Conv2d(256, 1, 1)
self.dsn4 = nn.Conv2d(512, 1, 1)
self.dsn7 = nn.Conv2d(4096, 1, 1)
self.res1 = Resnet.BasicBlock(64, 64, stride=1, downsample=None)
self.d1 = nn.Conv2d(64, 32, 1)
self.res2 = Resnet.BasicBlock(32, 32, stride=1, downsample=None)
self.d2 = nn.Conv2d(32, 16, 1)
self.res3 = Resnet.BasicBlock(16, 16, stride=1, downsample=None)
self.d3 = nn.Conv2d(16, 8, 1)
self.fuse = nn.Conv2d(8, 1, kernel_size=1, padding=0, bias=False)
self.cw = nn.Conv2d(2, 1, kernel_size=1, padding=0, bias=False)
self.gate1 = gsc.GatedSpatialConv2d(32, 32)
self.gate2 = gsc.GatedSpatialConv2d(16, 16)
self.gate3 = gsc.GatedSpatialConv2d(8, 8)
self.aspp = _AtrousSpatialPyramidPoolingModule(4096, 256,
output_stride=8)
self.bot_fine = nn.Conv2d(128, 48, kernel_size=1, bias=False)
self.bot_aspp = nn.Conv2d(1280 + 256, 256, kernel_size=1, bias=False)
self.final_seg = nn.Sequential(
nn.Conv2d(256 + 48, 256, kernel_size=3, padding=1, bias=False),
Norm2d(256),
nn.ReLU(inplace=True),
nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=False),
Norm2d(256),
nn.ReLU(inplace=True),
nn.Conv2d(256, num_classes, kernel_size=1, bias=False))
self.sigmoid = nn.Sigmoid()
initialize_weights(self.final_seg)
def forward(self, inp, gts=None):
x_size = inp.size()
# res 1
m1 = self.mod1(inp)
# res 2
m2 = self.mod2(self.pool2(m1))
# res 3
m3 = self.mod3(self.pool3(m2))
# res 4-7
m4 = self.mod4(m3)
m5 = self.mod5(m4)
m6 = self.mod6(m5)
m7 = self.mod7(m6)
        # 1-channel side outputs from the regular stream; they later gate the shape stream
        s3 = F.interpolate(self.dsn3(m3), x_size[2:],
                           mode='bilinear', align_corners=True)
        s4 = F.interpolate(self.dsn4(m4), x_size[2:],
                           mode='bilinear', align_corners=True)
        s7 = F.interpolate(self.dsn7(m7), x_size[2:],
                           mode='bilinear', align_corners=True)
m1f = F.interpolate(m1, x_size[2:], mode='bilinear', align_corners=True)
        # compute a Canny edge map for each image in the batch as an extra,
        # non-learned shape cue
        im_arr = inp.cpu().numpy().transpose((0, 2, 3, 1)).astype(np.uint8)
        canny = np.zeros((x_size[0], 1, x_size[2], x_size[3]))
        for i in range(x_size[0]):
            canny[i] = cv2.Canny(im_arr[i], 10, 100)
        canny = torch.from_numpy(canny).cuda().float()
        # shape stream: residual blocks and gated convolutions conditioned on
        # the side outputs from the regular stream
        cs = self.res1(m1f)
cs = F.interpolate(cs, x_size[2:],
mode='bilinear', align_corners=True)
cs = self.d1(cs)
cs = self.gate1(cs, s3)
cs = self.res2(cs)
cs = F.interpolate(cs, x_size[2:],
mode='bilinear', align_corners=True)
cs = self.d2(cs)
cs = self.gate2(cs, s4)
cs = self.res3(cs)
cs = F.interpolate(cs, x_size[2:],
mode='bilinear', align_corners=True)
cs = self.d3(cs)
cs = self.gate3(cs, s7)
cs = self.fuse(cs)
cs = F.interpolate(cs, x_size[2:],
mode='bilinear', align_corners=True)
edge_out = self.sigmoid(cs)
        # fuse the learned edge map with the Canny map to form the ASPP edge input
        cat = torch.cat((edge_out, canny), dim=1)
acts = self.cw(cat)
acts = self.sigmoid(acts)
# aspp
x = self.aspp(m7, acts)
dec0_up = self.bot_aspp(x)
dec0_fine = self.bot_fine(m2)
dec0_up = self.interpolate(dec0_up, m2.size()[2:], mode='bilinear',align_corners=True)
dec0 = [dec0_fine, dec0_up]
dec0 = torch.cat(dec0, 1)
dec1 = self.final_seg(dec0)
seg_out = self.interpolate(dec1, x_size[2:], mode='bilinear')
if self.training:
return self.criterion((seg_out, edge_out), gts)
else:
return seg_out, edge_out
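A quick smoke test for the model above, assuming the GSCNN repository's network and my_functionals packages are on PYTHONPATH and a GPU is available (the forward pass calls .cuda() internally); num_classes=19 corresponds to Cityscapes:

```python
import torch

model = GSCNN(num_classes=19).cuda().eval()
dummy = torch.randn(1, 3, 800, 800).cuda()  # matches the 800x800 training resolution
with torch.no_grad():
    seg_out, edge_out = model(dummy)        # eval mode skips the criterion
print(seg_out.shape, edge_out.shape)        # (1, 19, 800, 800), (1, 1, 800, 800)
```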
6. Running on Intel architectures (OpenVINO)
import cv2
from openvino.inference_engine import IECore
# load the model and choose a device
model_xml = 'path/to/model.xml'
model_bin = 'path/to/model.bin'
device = 'CPU'  # can be CPU, GPU, MYRIAD, HETERO:FPGA,CPU, etc.
ie = IECore()
net = ie.read_network(model=model_xml, weights=model_bin)
exec_net = ie.load_network(network=net, device_name=device)
# read the input image
image = cv2.imread('path/to/input_image.jpg')
# preprocess the input image
input_blob = next(iter(net.input_info))
n, c, h, w = net.input_info[input_blob].input_data.shape
processed_image = cv2.resize(image, (w, h))
processed_image = processed_image.transpose((2, 0, 1))
processed_image = processed_image.reshape((n, c, h, w))
# run inference
output_blob = next(iter(net.outputs))
result = exec_net.infer(inputs={input_blob: processed_image})
# post-process the output
output = result[output_blob]
# display or save the result
# ...
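The post-processing step is left open above. For a segmentation model whose output blob is an N×C×H×W score map (an assumption about this particular model), a common recipe is an argmax over the class axis plus a color overlay:

```python
import numpy as np
import cv2

# assumes `output` has shape (1, num_classes, H, W)
mask = np.argmax(output[0], axis=0).astype(np.uint8)       # (H, W) class ids
mask = cv2.resize(mask, (image.shape[1], image.shape[0]),
                  interpolation=cv2.INTER_NEAREST)          # back to input size
color = cv2.applyColorMap(mask * 10, cv2.COLORMAP_JET)     # crude visualization
overlay = cv2.addWeighted(image, 0.6, color, 0.4, 0)
cv2.imwrite('segmentation_overlay.jpg', overlay)
```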