首页 > 其他分享 >SSE图像优化代码-彩色图像转灰度

SSE图像优化代码-彩色图像转灰度

时间:2022-12-28 15:35:02 | 浏览次数:40
标签:__ 彩色图像 mm epi16 int WT 灰度 SSE m128i

#include <chrono>
#include <iostream>

#include <immintrin.h>
#include <opencv2/opencv.hpp>
using namespace std;

// Scalar reference conversion of a packed 3-bytes-per-pixel image to 8-bit gray.
//
// Byte 0 of each pixel is weighted as blue (0.114), byte 1 as green (0.587) and
// byte 2 as red (the remainder, ~0.299), using 8-bit fixed-point weights that
// sum to exactly 256 so the result always fits in one byte.
//
// Src    - first byte of the source image
// Dest   - first byte of the gray output; rows are written back to back
//          (row pitch == Width bytes)
// Width  - pixels per row
// Height - number of rows
// Stride - distance in bytes between the starts of two source rows
void RGB2Y(unsigned char* Src, unsigned char* Dest, int Width, int Height, int Stride)
{
    const int wBlue = int(0.114 * 256 + 0.5);
    const int wGreen = int(0.587 * 256 + 0.5);
    const int wRed = 256 - wBlue - wGreen;   // forces the three weights to sum to 256

    int row = 0;
    while (row < Height)
    {
        const unsigned char* in = Src + row * Stride;
        unsigned char* out = Dest + row * Width;

        int col = 0;
        while (col < Width)
        {
            const int weighted = wBlue * in[0] + wGreen * in[1] + wRed * in[2];
            out[col] = weighted >> 8;         // drop the 8 fractional bits
            in += 3;
            ++col;
        }
        ++row;
    }
}
// Same conversion as the plain scalar version, but with the inner loop
// manually unrolled to handle four pixels per iteration.
//
// Byte 0 of each pixel is weighted as blue, byte 1 as green, byte 2 as red,
// with 8-bit fixed-point weights that sum to 256.
//
// Src    - first byte of the source image (3 bytes per pixel)
// Dest   - first byte of the gray output; row pitch is Width bytes
// Width  - pixels per row
// Height - number of rows
// Stride - distance in bytes between the starts of two source rows
void RGB2Y1(unsigned char* Src, unsigned char* Dest, int Width, int Height, int Stride)
{
    const int wBlue = int(0.114 * 256 + 0.5);
    const int wGreen = int(0.587 * 256 + 0.5);
    const int wRed = 256 - wBlue - wGreen;   // forces the three weights to sum to 256

    // Weighted sum of one 3-byte pixel, scaled back from 8.8 fixed point.
    const auto toGray = [=](const unsigned char* px) -> unsigned char {
        return static_cast<unsigned char>((wBlue * px[0] + wGreen * px[1] + wRed * px[2]) >> 8);
    };

    for (int row = 0; row < Height; ++row)
    {
        const unsigned char* in = Src + row * Stride;
        unsigned char* out = Dest + row * Width;
        int col = 0;

        // Unrolled part: four pixels (12 source bytes) per pass. The bound is
        // Width - 4, so a final complete group of four is left to the tail loop.
        while (col < Width - 4)
        {
            out[col + 0] = toGray(in + 0);
            out[col + 1] = toGray(in + 3);
            out[col + 2] = toGray(in + 6);
            out[col + 3] = toGray(in + 9);
            col += 4;
            in += 12;
        }

        // Tail: whatever the unrolled loop did not cover, one pixel at a time.
        while (col < Width)
        {
            out[col] = toGray(in);
            ++col;
            in += 3;
        }
    }
}
// SSE version of the 3-bytes-per-pixel -> 8-bit gray conversion.
//
// Byte 0 of each pixel is weighted as blue, byte 1 as green, byte 2 as red,
// with 8-bit fixed-point weights that sum to 256 (so every 16-bit lane sum is
// at most 255 * 256 and cannot overflow).
//
// Each vector iteration converts 12 pixels (36 source bytes). Rows whose width
// is not covered by the vector loop are finished by a scalar tail loop, so
// every pixel of every row is written and nothing outside the Width * Height
// output area or the Width * 3 bytes of a source row is touched.
//
// Src    - first byte of the source image (3 bytes per pixel)
// Dest   - first byte of the gray output; row pitch is Width bytes
// Width  - pixels per row
// Height - number of rows
// Stride - distance in bytes between the starts of two source rows
#if (defined(__GNUC__) || defined(__clang__)) && (defined(__i386__) || defined(__x86_64__))
__attribute__((target("sse4.1")))   // allow the SSSE3/SSE4.1 intrinsics without -msse4.1
#endif
void RGB2Y2(unsigned char* Src, unsigned char* Dest, int Width, int Height, int Stride)
{
    const int B_WT = int(0.114 * 256 + 0.5);
    const int G_WT = int(0.587 * 256 + 0.5);
    const int R_WT = 256 - B_WT - G_WT;            //     int(0.299 * 256 + 0.5);

    // The three rotations of the weight pattern, one per byte offset inside a pixel.
    const __m128i wBGR = _mm_setr_epi16(B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT);
    const __m128i wGRB = _mm_setr_epi16(G_WT, R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT);
    const __m128i wRBG = _mm_setr_epi16(R_WT, B_WT, G_WT, R_WT, B_WT, G_WT, R_WT, B_WT);

    // Byte-shuffle masks that pick the low byte of the three valid 16-bit lanes
    // of each partial sum and drop it into its output slot (-1 yields zero).
    const __m128i pickAL = _mm_setr_epi8(0, 6, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    const __m128i pickAH = _mm_setr_epi8(-1, -1, -1, 2, 8, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    const __m128i pickBL = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, 0, 6, 12, -1, -1, -1, -1, -1, -1, -1);
    const __m128i pickBH = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 8, 14, -1, -1, -1, -1);

    for (int Y = 0; Y < Height; Y++)
    {
        const unsigned char* LinePS = Src + Y * Stride;
        unsigned char* LinePD = Dest + Y * Width;
        int X = 0;

        // The store below writes 16 bytes (12 results + 4 zero bytes), so only
        // vectorise while 16 output bytes remain in this row; the 4 extra bytes
        // are overwritten by the next iteration or by the tail loop.
        for (; X + 16 <= Width; X += 12, LinePS += 36)
        {
            // _mm_loadl_epi64   : load 8 bytes (no alignment requirement)
            // _mm_cvtepu8_epi16 : zero-extend 8 unsigned bytes to 8 x 16-bit
            // _mm_mullo_epi16   : lane-wise 16-bit multiply, low 16 bits kept
            // Only 8 bytes are loaded per step so no read goes past the
            // 36 source bytes of the current 12-pixel group.
            const __m128i p1aL = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(LinePS + 0))), wBGR);
            const __m128i p2aL = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(LinePS + 1))), wGRB);
            const __m128i p3aL = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(LinePS + 2))), wRBG);

            const __m128i p1aH = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(LinePS + 8))), wRBG);
            const __m128i p2aH = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(LinePS + 9))), wBGR);
            const __m128i p3aH = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(LinePS + 10))), wGRB);

            const __m128i p1bL = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(LinePS + 18))), wBGR);
            const __m128i p2bL = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(LinePS + 19))), wGRB);
            const __m128i p3bL = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(LinePS + 20))), wRBG);

            const __m128i p1bH = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(LinePS + 26))), wRBG);
            const __m128i p2bH = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(LinePS + 27))), wBGR);
            const __m128i p3bH = _mm_mullo_epi16(_mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(LinePS + 28))), wGRB);

            // Adding the three shifted products gives, in every third lane,
            // the complete weighted sum of one pixel.
            const __m128i sumaL = _mm_add_epi16(p3aL, _mm_add_epi16(p1aL, p2aL));
            const __m128i sumaH = _mm_add_epi16(p3aH, _mm_add_epi16(p1aH, p2aH));
            const __m128i sumbL = _mm_add_epi16(p3bL, _mm_add_epi16(p1bL, p2bL));
            const __m128i sumbH = _mm_add_epi16(p3bH, _mm_add_epi16(p1bH, p2bH));

            // Logical shift right by 8 removes the fixed-point fraction.
            const __m128i sclaL = _mm_srli_epi16(sumaL, 8);
            const __m128i sclaH = _mm_srli_epi16(sumaH, 8);
            const __m128i sclbL = _mm_srli_epi16(sumbL, 8);
            const __m128i sclbH = _mm_srli_epi16(sumbH, 8);

            // Gather the three valid results of each partial vector into
            // output bytes 0-2, 3-5, 6-8 and 9-11 respectively.
            const __m128i shftaL = _mm_shuffle_epi8(sclaL, pickAL);
            const __m128i shftaH = _mm_shuffle_epi8(sclaH, pickAH);
            const __m128i shftbL = _mm_shuffle_epi8(sclbL, pickBL);
            const __m128i shftbH = _mm_shuffle_epi8(sclbH, pickBH);

            // The four pieces occupy disjoint bytes, so OR merges them.
            const __m128i accumL = _mm_or_si128(shftaL, shftbL);
            const __m128i accumH = _mm_or_si128(shftaH, shftbH);
            const __m128i h3 = _mm_or_si128(accumL, accumH);

            // Unaligned 16-byte store: 12 gray values followed by 4 zero bytes.
            _mm_storeu_si128(reinterpret_cast<__m128i*>(LinePD + X), h3);
        }

        // Scalar tail for the pixels the vector loop could not cover.
        for (; X < Width; X++, LinePS += 3)
        {
            LinePD[X] = (B_WT * LinePS[0] + G_WT * LinePS[1] + R_WT * LinePS[2]) >> 8;
        }
    }
}
// Benchmark driver: loads a test image, resizes it to 1920x1280, converts it
// to gray Cnt times with RGB2Y1 and prints the average time per conversion.
// Returns 1 if the image cannot be loaded, 0 otherwise.
int main()
{
    cv::Mat img = cv::imread("C:\\Users\\Administrator\\Desktop\\test.png");
    if (img.empty())
    {
        // imread signals failure with an empty Mat; resize would throw on it.
        std::cerr << "failed to load image" << std::endl;
        return 1;
    }
    cv::resize(img, img, cv::Size{ 1920, 1280 });
    cv::Mat gray = cv::Mat(img.rows, img.cols, CV_8UC1);

    const int Cnt = 100;
    using namespace std::chrono;
    // steady_clock is monotonic, so the measurement is immune to wall-clock adjustments.
    const auto start = steady_clock::now();
    for (int m = 0; m < Cnt; ++m) {
        // Width is the column count, Height the row count, and the source row
        // pitch is taken from the Mat itself rather than recomputed.
        RGB2Y1(img.data, gray.data, img.cols, img.rows, static_cast<int>(img.step));
    }
    const auto end = steady_clock::now();

    // Average per call in milliseconds, kept in floating point to avoid truncation.
    const double msPerCall = duration<double, std::milli>(end - start).count() / Cnt;
    std::cout << "time cost:"
        << msPerCall
        << "ms" << std::endl;

    return 0;
}

ref:  https://blog.csdn.net/qq_48034474/article/details/123404894 https://cloud.tencent.com/developer/article/1011903

实测PC端SSE的加速效果不太好,速度慢了近一倍,不知道是哪里的原因。

 

标签:__,彩色图像,mm,epi16,int,WT,灰度,SSE,m128i
From: https://www.cnblogs.com/fourmi/p/17010250.html

相关文章

  • WebAssembly _ 转发
    原文:https://www.cnblogs.com/linguoguo/p/12125584.html 最近,WebAssembly在JavaScript圈非常的火!人们都在谈论它多么多么快,怎样怎样改变Web开发领域。但是没有......
  • bash: fork: retry: No child processes
    1、问题描述登录root用户,切换业务用户的时候卡顿准备重启jar包的时候报错:"fork:retry:Nochildprocesses""Resourcetemporarilyunavailable"服务器做过连接数设置vi/e......
  • 从三万英尺看全链路灰度
    作者:卜比全链路灰度是微服务领域,很实用的企业级场景下的技术能力。从本期开始,我们将通过《全链路灰度:自顶向下的方法》的系列文章,由远及近的剖析全链路灰度全貌,系列文章分为......
  • select()函数以及FD_ZERO、FD_SET、FD_CLR、FD_ISSET
    select函数用于在非阻塞中,当一个套接字或一组套接字有信号时通知你,系统提供select函数来实现多路复用输入/输出模型,原型: intselect(intmaxfd,fd_set*rdset,fd_set*wr......
  • 基于OpenVINO的端到端DL网络-Tesseract5+VS2017+win10源码编译攻略
    一,记录我目前在win10X64和VS2017的环境下成功编译Tesseract5.0的方式;二,记录在VS2017C++工程中调用Tesseract4.0的方法;三,记录编译和调用Tesseract4.0过程中踩到的坑和相......
  • 集成利用tesseract.exe进行ocr
      ocr是一个宽泛的概念。市场上面ocr将一直是一个不断发展、需求强烈的方向。  我认为,从难度上区分,中文ocr难于英文ocr;手写ocr难于印刷ocr。所以两两组合,中文手写体......
  • SYSU-SSE 3D游戏编程与设计 学习笔记(6)--模型与动画
    智能巡逻兵游戏代码:游戏代码游戏演示视频:演示视频编程内容实现思路组织游戏资源,将地图、巡逻兵和玩家做成预制,其中巡逻兵和玩家模型来自于AssetStore玩家巡逻......
  • Argocd rollout 蓝绿发布步以及灰度发布步骤图形讲解
    灰度发布1、5个pod2、百分之二十灰度3、全部新版蓝绿发布1、原始应用2、部署预览服务3、流量切换删除旧pod......
  • R语言和Python用泊松过程扩展:霍克斯过程Hawkes Processes分析比特币交易数据订单到达
    全文下载链接:http://tecdat.cn/?p=25880 最近我们被客户要求撰写关于泊松过程的研究报告,包括一些图形和统计输出。本文描述了一个模型,该模型解释了交易的聚集到达,并展示......
  • Unity AssetBundle 所涉及的CRC
    CRC的作用校验文件是否被篡改过!加载ab包时使用crc进行校验比如publicstaticAssetBundleLoadFromFile(stringpath,uintcrc,ulongoffset),注意:这里传入的crc,必......