首页 > 其他分享 >AV_PIX_FMT_CUDA 数据转 RGB

AV_PIX_FMT_CUDA 数据转 RGB

时间:2022-09-21 00:34:29浏览次数:88  
标签:FMT cudaStatus gpuFrame PIX RGB MUL constHueColorSpaceMat2 chromaCr chromaCb

ffmpeg的cuvid解码结果数据格式为 AV_PIX_FMT_CUDA,实际使用中后续接算法需要转为RGB。算法跑在显卡上,解码也在显卡上,所以转换也定为直接在显卡上进行。

 

关于ffmpeg的cuvid解码的网上博客写的比较多,个人参考的一个比较好的 https://blog.csdn.net/qq_40116098/article/details/120704340

这里只记录解码出来的AV_PIX_FMT_CUDA格式的AVFrame数据转为RGB数据。

基于CUDA的代码实现:

#include "cuda_kernels.h"

#include <builtin_types.h>
#include "common/inc/helper_cuda_drvapi.h"

typedef unsigned char   uint8;
typedef unsigned int    uint32;
typedef int             int32;

#define COLOR_COMPONENT_MASK            0x3FF
#define COLOR_COMPONENT_BIT_SIZE        10

namespace cuda_common
{

#define MUL(x,y)    ((x)*(y))

    __constant__ float  constHueColorSpaceMat2[9];  //默认分配到0卡上,未找到分配到指定卡上设置方法,当前也未用到,先注释

    __device__ void YUV2RGB2(uint32 *yuvi, float *red, float *green, float *blue)
    {
        float luma, chromaCb, chromaCr;

        // Prepare for hue adjustment
        luma = (float)yuvi[0];
        chromaCb = (float)((int32)yuvi[1] - 512.0f);
        chromaCr = (float)((int32)yuvi[2] - 512.0f);


        // Convert YUV To RGB with hue adjustment
        *red = MUL(luma, constHueColorSpaceMat2[0]) +
            MUL(chromaCb, constHueColorSpaceMat2[1]) +
            MUL(chromaCr, constHueColorSpaceMat2[2]);
        *green = MUL(luma, constHueColorSpaceMat2[3]) +
            MUL(chromaCb, constHueColorSpaceMat2[4]) +
            MUL(chromaCr, constHueColorSpaceMat2[5]);
        *blue = MUL(luma, constHueColorSpaceMat2[6]) +
            MUL(chromaCb, constHueColorSpaceMat2[7]) +
            MUL(chromaCr, constHueColorSpaceMat2[8]);

    }

    __device__ unsigned char clip_v(int x, int min_val, int  max_val) {
        if (x>max_val) {
            return max_val;
        }
        else if (x<min_val) {
            return min_val;
        }
        else {
            return x;
        }
    }

        // CUDA kernel for outputing the final RGB output from NV12;

    extern "C"
        __global__ void CUDAToBGR_drvapi(uint32 *dataY, uint32 *dataUV, size_t pitchY, size_t pitchUV, unsigned char *dstImage, int width, int height)
    {

        int32 x, y;

        // Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread
        x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
        y = blockIdx.y *  blockDim.y + threadIdx.y;

        if (x >= width)
        {
            return; 
        }

        if (y >= height)
        {
            return; 
        }

        uint32 yuv101010Pel[2];
        uint8 *srcImageU8_Y = (uint8 *)dataY;
        uint8 *srcImageU8_UV = (uint8 *)dataUV;

        // Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way.
        // if we move to texture we could read 4 luminance values
        yuv101010Pel[0] = (srcImageU8_Y[y * pitchY + x]) << 2;
        yuv101010Pel[1] = (srcImageU8_Y[y * pitchY + x + 1]) << 2;

        int32 y_chroma = y >> 1;

        if (y & 1)  // odd scanline ?
        {
            uint32 chromaCb;
            uint32 chromaCr;

            chromaCb = srcImageU8_UV[y_chroma * pitchUV + x];
            chromaCr = srcImageU8_UV[y_chroma * pitchUV + x + 1];

            if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically
            {
                chromaCb = (chromaCb + srcImageU8_UV[(y_chroma + 1) * pitchUV + x] + 1) >> 1;
                chromaCr = (chromaCr + srcImageU8_UV[(y_chroma + 1) * pitchUV + x + 1] + 1) >> 1;
            }

            yuv101010Pel[0] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2));
            yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));

            yuv101010Pel[1] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2));
            yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
        }
        else
        {
            yuv101010Pel[0] |= ((uint32)srcImageU8_UV[y_chroma * pitchUV + x] << (COLOR_COMPONENT_BIT_SIZE + 2));
            yuv101010Pel[0] |= ((uint32)srcImageU8_UV[y_chroma * pitchUV + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));

            yuv101010Pel[1] |= ((uint32)srcImageU8_UV[y_chroma * pitchUV + x] << (COLOR_COMPONENT_BIT_SIZE + 2));
            yuv101010Pel[1] |= ((uint32)srcImageU8_UV[y_chroma * pitchUV + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
        }

        // this steps performs the color conversion
        uint32 yuvi[6];
        float red[2], green[2], blue[2];

        yuvi[0] = (yuv101010Pel[0] & COLOR_COMPONENT_MASK);
        yuvi[1] = ((yuv101010Pel[0] >> COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK);
        yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);

        yuvi[3] = (yuv101010Pel[1] & COLOR_COMPONENT_MASK);
        yuvi[4] = ((yuv101010Pel[1] >> COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK);
        yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);

        // YUV to RGB Transformation conversion
        YUV2RGB2(&yuvi[0], &red[0], &green[0], &blue[0]);
        YUV2RGB2(&yuvi[3], &red[1], &green[1], &blue[1]);


        dstImage[y * width * 3 + x * 3] = clip_v(blue[0] * 0.25,0 ,255);
        dstImage[y * width * 3 + x * 3 + 3] = clip_v(blue[1] * 0.25,0, 255);

        dstImage[width * y * 3 + x * 3 + 1] = clip_v(green[0] * 0.25,0 ,255);
        dstImage[width * y * 3 + x * 3 + 4] = clip_v(green[1] * 0.25,0, 255);

        dstImage[width * y * 3 + x * 3 + 2] = clip_v(red[0] * 0.25, 0, 255);
        dstImage[width * y * 3 + x * 3 + 5] = clip_v(red[1] * 0.25,0 ,255);
    }

    cudaError_t setColorSpace2(e_ColorSpace CSC, float hue)
    {

        float hueSin = sin(hue);
        float hueCos = cos(hue);

        float hueCSC[9];
        if (CSC == ITU601)
        {
            //CCIR 601
            hueCSC[0] = 1.1644f;
            hueCSC[1] = hueSin * 1.5960f;
            hueCSC[2] = hueCos * 1.5960f;
            hueCSC[3] = 1.1644f;
            hueCSC[4] = (hueCos * -0.3918f) - (hueSin * 0.8130f);
            hueCSC[5] = (hueSin *  0.3918f) - (hueCos * 0.8130f);
            hueCSC[6] = 1.1644f;
            hueCSC[7] = hueCos *  2.0172f;
            hueCSC[8] = hueSin * -2.0172f;
        }
        else if (CSC == ITU709)
        {
            //CCIR 709
            hueCSC[0] = 1.0f;
            hueCSC[1] = hueSin * 1.57480f;
            hueCSC[2] = hueCos * 1.57480f;
            hueCSC[3] = 1.0;
            hueCSC[4] = (hueCos * -0.18732f) - (hueSin * 0.46812f);
            hueCSC[5] = (hueSin *  0.18732f) - (hueCos * 0.46812f);
            hueCSC[6] = 1.0f;
            hueCSC[7] = hueCos *  1.85560f;
            hueCSC[8] = hueSin * -1.85560f;
        }

        cudaError_t cudaStatus = cudaMemcpyToSymbol(constHueColorSpaceMat2, hueCSC, 9 * sizeof(float), 0, cudaMemcpyHostToDevice);
        float tmpf[9];
        memset(tmpf, 0, 9 * sizeof(float));
        cudaMemcpyFromSymbol(tmpf, constHueColorSpaceMat2, 9 * sizeof(float), 0, ::cudaMemcpyDefault);
        cudaDeviceSynchronize();

        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpyToSymbol failed: %s\n", cudaGetErrorString(cudaStatus));
        }

        return cudaStatus;
    }

    cudaError_t CUDAToBGR(CUdeviceptr dataY, CUdeviceptr dataUV, size_t pitchY, size_t pitchUV, unsigned char* d_dstRGB, int width, int height)
    {
        dim3 block(32, 16, 1);
        dim3 grid((width + (2 * block.x - 1)) / (2 * block.x), (height + (block.y - 1)) / block.y, 1);
        CUDAToBGR_drvapi << < grid, block >> >((uint32 *)dataY, (uint32 *)dataUV, pitchY, pitchUV, d_dstRGB, width, height);
        cudaError_t cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "NV12ToRGB_drvapi launch failed: %s\n", cudaGetErrorString(cudaStatus));
            return cudaStatus;
        }

        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching NV12ToRGB_drvapi !\n", cudaStatus);
            return cudaStatus;
        }

        return cudaStatus;
    }
}

对外接口为 CUDAToBGR 函数,核心实现在 CUDAToBGR_drvapi 函数中。

其中输入参数把Y和UV分开了,其原因是ffmpeg对 AV_PIX_FMT_CUDA 格式数据,在AVFrame中data[0]和data[1]都存在值,AV_PIX_FMT_CUDA 数据其实应该就是显卡上的NV12数据,所以个人推测data[0]是Y数据,data[1]是UV数据。实际转换的效果基本证明这个推测是对的。

调用以实现转换:

if (gpuFrame->format == AV_PIX_FMT_CUDA)
{
     cudaError_t cudaStatus;
    if(pHwRgb == nullptr){
        cudaStatus = cudaMalloc((void **)&pHwRgb, 3 * gpuFrame->width * gpuFrame->height * sizeof(unsigned char));
    }
    cudaStatus = cuda_common::CUDAToBGR((CUdeviceptr)gpuFrame->data[0],(CUdeviceptr)gpuFrame->data[1], gpuFrame->linesize[0], gpuFrame->linesize[1], pHwRgb, gpuFrame->width, gpuFrame->height);
    cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        cout << "CUDAToBGR failed !!!" << endl;
        return;
    }
    // saveJpeg("/home/cmhu/FFNvDecoder/a.jpg", pHwRgb, gpuFrame->width, gpuFrame->height);  // 验证 CUDAToRGB 
}

其中 pHwRgb 是unsigned char* ,用cudaMalloc分配的在显卡上的数据。 gpuFrame 是 AVFrame *,是用cuvid硬解出来的帧数据。

转换后的数据保存下来的jpeg图片样例:

结果

可以看到效果良好。

 

 

 

 

 

 

 

 

 

 

 


 

标签:FMT,cudaStatus,gpuFrame,PIX,RGB,MUL,constHueColorSpaceMat2,chromaCr,chromaCb
From: https://www.cnblogs.com/betterwgo/p/16712604.html

相关文章

  • ubuntu1804 pixel xl 编译安装lineage-18.1
    官方文档https://wiki.lineageos.org/devices/marlin/build下载源码repoinit-uhttps://github.com/LineageOS/android.git-blineage-18.1reposync-c此处需要梯......
  • Yet Another RGB Sequence(排列组合)
    题意问有多少字符串满足如下要求:只包含R、G、B三种字符,并且数量分别是\(A\),\(B\),\(C\)。包含\(K\)个连续子串RG。题目链接:https://atcoder.jp/contests/abc266/tasks......
  • Basler相机Bayer格式转Qt RGB888
    无论什么品牌的相机,Bayer转RGB都涉及到插值,因此建议使用官方SDK里的函数进行转换。针对Basler相机,代码如下:voidBaslerCamera::toQImage(CGrabResultPtrptrGrabResult,......
  • 莫兰迪色系色卡RGB对照表
    【转】莫兰迪色系色卡RGB对照表莫兰迪色系,广义上是指饱和度低、带有灰调的颜色系列,大家也常把它称为“高级灰”。近年流行的雾蓝、烟粉、豆绿、冷咖等,都是由莫兰迪色系所......
  • NVIDIA 解释了为什么它认为 Pixar 发明的协议是“虚拟世界的 HTML”
    NVIDIA解释了为什么它认为Pixar发明的协议是“虚拟世界的HTML”NVIDIA是科技行业的重要参与者之一,它正在推动通用场景描述协议作为虚拟世界中可互操作内容和体验的基......
  • 花了两天时间搞定的bug:Unable to convert the Pixel Data as the 'pylibjpeg-libjpeg'
    BUG发生场景:在使用Pydicom包读取含下列压缩类型中的JPEGLossless(Process14,SV1)的dcm图像时,由下图可知需要安装GDCM或者pylibjpeg才能进行正常读取。然而,笔者不仅仅安......
  • rgba转化为未调整的16进制
    根据之前的讲解也就是这篇:https://www.cnblogs.com/heibaiqi/p/16612375.html,了解到16进制和rgba的关系,但是我碰到rgba转化为未调整的16进制时,居然搜不到,就一个简简单单的......
  • 16进制和rgba之间的联系
    对于大部分前端用户这两个内容咱们都了解,rgba 最后的a设置透明度,但是通过16进制设置透明度咱们并不是常用,但是不影响设置,比如网易云的红色为:#E20000如果不用rbga咱们之......
  • CF1196D2 RGB Substring (hard version)
    https://www.luogu.com.cn/problem/CF1196D2前缀和黄色题思路:看当前输入要被修改的这个字符串的第i位,是否与'R','G','B'三个中的一个相等,不相等的另外两个则增加一次修......
  • RGB转YUV
    根据前面YCbCr转RGB章节,可以知道YUV与RGB互转的公式。这里不再赘述,直接上RGB转YUV的代码。RGB2YUV.v1//*************************************************......