AV_PIX_FMT_CUDA 数据转 RGB

标签：FMT cudaStatus gpuFrame PIX RGB MUL constHueColorSpaceMat2 chromaCr chromaCb

ffmpeg的cuvid解码结果数据格式为 AV_PIX_FMT_CUDA，实际使用中后续接算法需要转为RGB。算法跑在显卡上，解码也在显卡上，所以转换也定为直接在显卡上进行。

关于ffmpeg的cuvid解码的网上博客写的比较多，个人参考的一个比较好的 https://blog.csdn.net/qq_40116098/article/details/120704340

这里只记录解码出来的AV_PIX_FMT_CUDA格式的AVFrame数据转为RGB数据。

基于CUDA的代码实现：

#include "cuda_kernels.h"

#include <builtin_types.h>
#include "common/inc/helper_cuda_drvapi.h"

typedef unsigned char   uint8;
typedef unsigned int    uint32;
typedef int             int32;

#define COLOR_COMPONENT_MASK            0x3FF
#define COLOR_COMPONENT_BIT_SIZE        10

namespace cuda_common
{

#define MUL(x,y)    ((x)*(y))

    __constant__ float  constHueColorSpaceMat2[9];  //默认分配到0卡上，未找到分配到指定卡上设置方法，当前也未用到，先注释

    __device__ void YUV2RGB2(uint32 *yuvi, float *red, float *green, float *blue)
    {
        float luma, chromaCb, chromaCr;

        // Prepare for hue adjustment
        luma = (float)yuvi[0];
        chromaCb = (float)((int32)yuvi[1] - 512.0f);
        chromaCr = (float)((int32)yuvi[2] - 512.0f);


        // Convert YUV To RGB with hue adjustment
        *red = MUL(luma, constHueColorSpaceMat2[0]) +
            MUL(chromaCb, constHueColorSpaceMat2[1]) +
            MUL(chromaCr, constHueColorSpaceMat2[2]);
        *green = MUL(luma, constHueColorSpaceMat2[3]) +
            MUL(chromaCb, constHueColorSpaceMat2[4]) +
            MUL(chromaCr, constHueColorSpaceMat2[5]);
        *blue = MUL(luma, constHueColorSpaceMat2[6]) +
            MUL(chromaCb, constHueColorSpaceMat2[7]) +
            MUL(chromaCr, constHueColorSpaceMat2[8]);

    }

    __device__ unsigned char clip_v(int x, int min_val, int  max_val) {
        if (x>max_val) {
            return max_val;
        }
        else if (x<min_val) {
            return min_val;
        }
        else {
            return x;
        }
    }

        // CUDA kernel for outputing the final RGB output from NV12;

    extern "C"
        __global__ void CUDAToBGR_drvapi(uint32 *dataY, uint32 *dataUV, size_t pitchY, size_t pitchUV, unsigned char *dstImage, int width, int height)
    {

        int32 x, y;

        // Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread
        x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
        y = blockIdx.y *  blockDim.y + threadIdx.y;

        if (x >= width)
        {
            return; 
        }

        if (y >= height)
        {
            return; 
        }

        uint32 yuv101010Pel[2];
        uint8 *srcImageU8_Y = (uint8 *)dataY;
        uint8 *srcImageU8_UV = (uint8 *)dataUV;

        // Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way.
        // if we move to texture we could read 4 luminance values
        yuv101010Pel[0] = (srcImageU8_Y[y * pitchY + x]) << 2;
        yuv101010Pel[1] = (srcImageU8_Y[y * pitchY + x + 1]) << 2;

        int32 y_chroma = y >> 1;

        if (y & 1)  // odd scanline ?
        {
            uint32 chromaCb;
            uint32 chromaCr;

            chromaCb = srcImageU8_UV[y_chroma * pitchUV + x];
            chromaCr = srcImageU8_UV[y_chroma * pitchUV + x + 1];

            if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically
            {
                chromaCb = (chromaCb + srcImageU8_UV[(y_chroma + 1) * pitchUV + x] + 1) >> 1;
                chromaCr = (chromaCr + srcImageU8_UV[(y_chroma + 1) * pitchUV + x + 1] + 1) >> 1;
            }

            yuv101010Pel[0] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2));
            yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));

            yuv101010Pel[1] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2));
            yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
        }
        else
        {
            yuv101010Pel[0] |= ((uint32)srcImageU8_UV[y_chroma * pitchUV + x] << (COLOR_COMPONENT_BIT_SIZE + 2));
            yuv101010Pel[0] |= ((uint32)srcImageU8_UV[y_chroma * pitchUV + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));

            yuv101010Pel[1] |= ((uint32)srcImageU8_UV[y_chroma * pitchUV + x] << (COLOR_COMPONENT_BIT_SIZE + 2));
            yuv101010Pel[1] |= ((uint32)srcImageU8_UV[y_chroma * pitchUV + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
        }

        // this steps performs the color conversion
        uint32 yuvi[6];
        float red[2], green[2], blue[2];

        yuvi[0] = (yuv101010Pel[0] & COLOR_COMPONENT_MASK);
        yuvi[1] = ((yuv101010Pel[0] >> COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK);
        yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);

        yuvi[3] = (yuv101010Pel[1] & COLOR_COMPONENT_MASK);
        yuvi[4] = ((yuv101010Pel[1] >> COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK);
        yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);

        // YUV to RGB Transformation conversion
        YUV2RGB2(&yuvi[0], &red[0], &green[0], &blue[0]);
        YUV2RGB2(&yuvi[3], &red[1], &green[1], &blue[1]);


        dstImage[y * width * 3 + x * 3] = clip_v(blue[0] * 0.25,0 ,255);
        dstImage[y * width * 3 + x * 3 + 3] = clip_v(blue[1] * 0.25,0, 255);

        dstImage[width * y * 3 + x * 3 + 1] = clip_v(green[0] * 0.25,0 ,255);
        dstImage[width * y * 3 + x * 3 + 4] = clip_v(green[1] * 0.25,0, 255);

        dstImage[width * y * 3 + x * 3 + 2] = clip_v(red[0] * 0.25, 0, 255);
        dstImage[width * y * 3 + x * 3 + 5] = clip_v(red[1] * 0.25,0 ,255);
    }

    cudaError_t setColorSpace2(e_ColorSpace CSC, float hue)
    {

        float hueSin = sin(hue);
        float hueCos = cos(hue);

        float hueCSC[9];
        if (CSC == ITU601)
        {
            //CCIR 601
            hueCSC[0] = 1.1644f;
            hueCSC[1] = hueSin * 1.5960f;
            hueCSC[2] = hueCos * 1.5960f;
            hueCSC[3] = 1.1644f;
            hueCSC[4] = (hueCos * -0.3918f) - (hueSin * 0.8130f);
            hueCSC[5] = (hueSin *  0.3918f) - (hueCos * 0.8130f);
            hueCSC[6] = 1.1644f;
            hueCSC[7] = hueCos *  2.0172f;
            hueCSC[8] = hueSin * -2.0172f;
        }
        else if (CSC == ITU709)
        {
            //CCIR 709
            hueCSC[0] = 1.0f;
            hueCSC[1] = hueSin * 1.57480f;
            hueCSC[2] = hueCos * 1.57480f;
            hueCSC[3] = 1.0;
            hueCSC[4] = (hueCos * -0.18732f) - (hueSin * 0.46812f);
            hueCSC[5] = (hueSin *  0.18732f) - (hueCos * 0.46812f);
            hueCSC[6] = 1.0f;
            hueCSC[7] = hueCos *  1.85560f;
            hueCSC[8] = hueSin * -1.85560f;
        }

        cudaError_t cudaStatus = cudaMemcpyToSymbol(constHueColorSpaceMat2, hueCSC, 9 * sizeof(float), 0, cudaMemcpyHostToDevice);
        float tmpf[9];
        memset(tmpf, 0, 9 * sizeof(float));
        cudaMemcpyFromSymbol(tmpf, constHueColorSpaceMat2, 9 * sizeof(float), 0, ::cudaMemcpyDefault);
        cudaDeviceSynchronize();

        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpyToSymbol failed: %s\n", cudaGetErrorString(cudaStatus));
        }

        return cudaStatus;
    }

    cudaError_t CUDAToBGR(CUdeviceptr dataY, CUdeviceptr dataUV, size_t pitchY, size_t pitchUV, unsigned char* d_dstRGB, int width, int height)
    {
        dim3 block(32, 16, 1);
        dim3 grid((width + (2 * block.x - 1)) / (2 * block.x), (height + (block.y - 1)) / block.y, 1);
        CUDAToBGR_drvapi << < grid, block >> >((uint32 *)dataY, (uint32 *)dataUV, pitchY, pitchUV, d_dstRGB, width, height);
        cudaError_t cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "NV12ToRGB_drvapi launch failed: %s\n", cudaGetErrorString(cudaStatus));
            return cudaStatus;
        }

        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching NV12ToRGB_drvapi !\n", cudaStatus);
            return cudaStatus;
        }

        return cudaStatus;
    }
}

对外接口为 CUDAToBGR 函数，核心实现在 CUDAToBGR_drvapi 函数中。

其中输入参数把Y和UV分开了，其原因是ffmpeg对 AV_PIX_FMT_CUDA 格式数据，在AVFrame中data[0]和data[1]都存在值，AV_PIX_FMT_CUDA 数据其实应该就是显卡上的NV12数据，所以个人推测data[0]是Y数据，data[1]是UV数据。实际转换的效果基本证明这个推测是对的。

调用以实现转换：

if (gpuFrame->format == AV_PIX_FMT_CUDA)
{
     cudaError_t cudaStatus;
    if(pHwRgb == nullptr){
        cudaStatus = cudaMalloc((void **)&pHwRgb, 3 * gpuFrame->width * gpuFrame->height * sizeof(unsigned char));
    }
    cudaStatus = cuda_common::CUDAToBGR((CUdeviceptr)gpuFrame->data[0],(CUdeviceptr)gpuFrame->data[1], gpuFrame->linesize[0], gpuFrame->linesize[1], pHwRgb, gpuFrame->width, gpuFrame->height);
    cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        cout << "CUDAToBGR failed !!!" << endl;
        return;
    }
    // saveJpeg("/home/cmhu/FFNvDecoder/a.jpg", pHwRgb, gpuFrame->width, gpuFrame->height);  // 验证 CUDAToRGB 
}

其中 pHwRgb 是unsigned char* ,用cudaMalloc分配的在显卡上的数据。 gpuFrame 是 AVFrame *，是用cuvid硬解出来的帧数据。

转换后的数据保存下来的jpeg图片样例：

可以看到效果良好。

标签：FMT,cudaStatus,gpuFrame,PIX,RGB,MUL,constHueColorSpaceMat2,chromaCr,chromaCb
From： https://www.cnblogs.com/betterwgo/p/16712604.html

相关文章

赞助商

阅读排行