问题来源:
1. 需要对输入图像中的一个环形区域,进行极坐标逆变换,将该环形区域转换为一张新的矩形图像
2. opencv没有直接对环形区域图像进行变换的函数,需要通过循环遍历的方式,利用polarToCart进行转换
3. 循环遍历不可避免的带来速度上的问题,尤其是图片较大时
解决思路:
1. 使用opencv自带的多线程parallel_for_进行加速
2. 利用cuda编程在gpu中加速
3. 为了便于在cuda中实现相应功能,坐标转换的相关代码根据原理手动实现
一、利用opencv的parallel_for_多线程运算
// Inverse polar transform of an annular region of mat1 into the rectangular
// image mat2, parallelized with OpenCV's parallel_for_.
//
// mat1   : source image, sampled as CV_8UC1 (single channel, 8-bit).
// mat2   : output, allocated here as CV_8UC1 with
//          rows = ceil(max_radius - min_radius) and
//          cols = ceil((max_theta - min_theta) * max_radius)
//          (one row per pixel of radius, one column per unit of arc length
//          at the outer radius).
// center : center of the annulus in mat1 (pixel coordinates).
// min_radius / max_radius : radial extent of the annulus, in pixels.
// min_theta  / max_theta  : angular extent, in radians.
//
// Convention used throughout: x = cx + r*sin(theta), y = cy + r*cos(theta).
// Output rows are flipped so that row 0 corresponds to the OUTER radius.
// Pixels whose source position falls outside mat1 remain 0.
void polar2cart_cv(cv::Mat& mat1, cv::Mat& mat2,
const cv::Point2f& center, const float& min_radius,
const float& max_radius, const float& min_theta, const float& max_theta)
{
    int rows_c = static_cast<int>(std::ceil(max_radius - min_radius));
    int cols_c = static_cast<int>(std::ceil((max_theta - min_theta) * max_radius));
    mat2 = cv::Mat::zeros(rows_c, cols_c, CV_8UC1);

    const float rstep = 1.0f;                                  // radial step, pixels
    const float thetaStep = (max_theta - min_theta) / cols_c;  // radians per column
    const float cx = center.x;
    const float cy = center.y;

    // sin/cos depend only on the column index, so precompute them once
    // (O(cols) trig calls) instead of once per pixel (O(rows*cols)).
    std::vector<float> sin_lut(cols_c), cos_lut(cols_c);
    for (int col = 0; col < cols_c; ++col)
    {
        const float theta_p = min_theta + col * thetaStep;
        sin_lut[col] = std::sin(theta_p);
        cos_lut[col] = std::cos(theta_p);
    }

    cv::parallel_for_(cv::Range(0, rows_c * cols_c), [&](const cv::Range& range) {
        for (int r = range.start; r < range.end; r++)
        {
            const int row = r / cols_c;
            const int col = r % cols_c;
            const float temp_r = min_radius + row * rstep;
            // Cartesian position of this polar sample in the source image
            // (nearest-neighbour: coordinates are truncated to int).
            const int polar_x = static_cast<int>(cx + temp_r * sin_lut[col]);
            const int polar_y = static_cast<int>(cy + temp_r * cos_lut[col]);
            if (polar_x >= 0 && polar_x < mat1.cols && polar_y >= 0 && polar_y < mat1.rows)
            {
                // Flip rows: increasing radius maps to decreasing row index.
                mat2.at<uchar>(rows_c - 1 - row, col) = mat1.at<uchar>(polar_y, polar_x);
            }
        }
    });
}
二、利用CUDA编程在GPU中并行运算
// GPU version of the annulus-to-rectangle inverse polar transform.
// Allocates device buffers, uploads the source image, launches the
// `process` kernel (one thread per OUTPUT pixel) and downloads the result.
//
// mat1 : source CV_8UC1 image; assumed continuous, since .data is copied as
//        a flat rows*cols byte buffer — TODO confirm callers never pass ROIs.
// mat2 : output CV_8UC1 image, allocated here (sizing as in polar2cart_cv).
// Remaining parameters describe the annulus geometry (see polar2cart_cv).
//
// NOTE: device buffers are allocated and freed on every call; for repeated
// calls on same-sized images, caching the allocations would remove most of
// the per-call overhead the first iteration exhibits.
void polar2cart_cuda(cv::Mat& mat1, cv::Mat& mat2,
const cv::Point2f& center, const float& min_radius,
const float& max_radius, const float& min_theta, const float& max_theta)
{
    int rows_c = static_cast<int>(std::ceil(max_radius - min_radius));
    int cols_c = static_cast<int>(std::ceil((max_theta - min_theta) * max_radius));
    mat2 = cv::Mat::zeros(rows_c, cols_c, CV_8UC1);

    float thetaStep = (max_theta - min_theta) / cols_c;
    float center_polar_x = center.x;
    float center_polar_y = center.y;

    // Fail loudly on any CUDA error instead of silently returning a
    // zero/garbage image (every runtime call returns a status).
    auto cudaCheck = [](cudaError_t err) { CV_Assert(err == cudaSuccess); };

    const size_t srcBytes = static_cast<size_t>(mat1.rows) * mat1.cols * sizeof(uchar);
    const size_t dstBytes = static_cast<size_t>(mat2.rows) * mat2.cols * sizeof(uchar);

    uchar* deviceMat1Data = nullptr;
    uchar* deviceMat2Data = nullptr;
    cudaCheck(cudaMalloc((void**)&deviceMat1Data, srcBytes));
    cudaCheck(cudaMalloc((void**)&deviceMat2Data, dstBytes));

    // Upload the source image. The output only needs to be zeroed, which is
    // cheaper done on the device than copying a zero-filled buffer over PCIe.
    cudaCheck(cudaMemcpy(deviceMat1Data, mat1.data, srcBytes, cudaMemcpyHostToDevice));
    cudaCheck(cudaMemset(deviceMat2Data, 0, dstBytes));

    // One thread per OUTPUT pixel: size the grid by mat2 alone. (Sizing by
    // max(mat1, mat2) would launch blocks that only hit the kernel's bounds
    // guard and do no work.)
    dim3 blockSize(32, 32);
    dim3 gridSize((mat2.cols + blockSize.x - 1) / blockSize.x,
                  (mat2.rows + blockSize.y - 1) / blockSize.y);
    process<<<gridSize, blockSize>>>(deviceMat1Data, mat1.rows, mat1.cols,
                                     deviceMat2Data, mat2.rows, mat2.cols,
                                     1, 1, min_radius, min_theta, thetaStep,
                                     center_polar_x, center_polar_y);
    cudaCheck(cudaGetLastError());  // catches bad launch configuration

    // Blocking copy back; this synchronizes and also surfaces any
    // asynchronous kernel-execution error.
    cudaCheck(cudaMemcpy(mat2.data, deviceMat2Data, dstBytes, cudaMemcpyDeviceToHost));

    cudaCheck(cudaFree(deviceMat1Data));
    cudaCheck(cudaFree(deviceMat2Data));
}
gpu中进行极坐标逆变换的核心函数
// Kernel for the inverse polar transform: one thread per OUTPUT pixel.
// The launch grid must cover mat2Rows x mat2Cols; surplus threads exit at
// the bounds guard. Output row 0 corresponds to the OUTER radius.
//
// mat1Data / mat2Data   : device pointers to tightly packed pixel data.
// channels1 / channels2 : per-pixel channel strides used when flattening the
//                         indices; only the first channel is copied, so the
//                         transform is effectively single-channel.
// min_radius, min_theta : inner radius / start angle of the annulus.
// thetaStep             : radians per output column.
// center_polar_x/_y     : annulus center in the source image (x=col, y=row).
__global__ void process(uchar* mat1Data, int mat1Rows, int mat1Cols, uchar* mat2Data, int mat2Rows, int mat2Cols, int channels1, int channels2, float min_radius, float min_theta, float thetaStep, float center_polar_x, float center_polar_y)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= mat2Rows || col >= mat2Cols)
        return;  // grid-tail guard

    // Angle of this column. Use the single-precision device math function
    // sincosf (computes both values at once) rather than std::sin/std::cos,
    // which are host-oriented overloads.
    float theta_p = min_theta + col * thetaStep;
    float sintheta;
    float costheta;
    sincosf(theta_p, &sintheta, &costheta);

    // Radius of this row (radial step is 1 pixel), then the Cartesian
    // position of the sample in the source image.
    // Convention: x = cx + r*sin(theta), y = cy + r*cos(theta).
    float temp_r = min_radius + static_cast<float>(row);
    int src_col = static_cast<int>(center_polar_x + temp_r * sintheta);
    int src_row = static_cast<int>(center_polar_y + temp_r * costheta);

    // Output row is flipped: image row index decreases as radius grows;
    // columns map one-to-one.
    int dst_row = mat2Rows - 1 - row;
    int dst_col = col;

    // Samples falling outside the source are skipped; the output buffer was
    // zero-initialized by the host, so those pixels stay 0.
    if (src_col >= 0 && src_col < mat1Cols && src_row >= 0 && src_row < mat1Rows)
    {
        int srcIdx = (src_row * mat1Cols + src_col) * channels1;
        int dstIdx = (dst_row * mat2Cols + dst_col) * channels2;
        mat2Data[dstIdx] = mat1Data[srcIdx];
    }
}
三、测试与结论
1.测试代码
#include <iostream>
#include <opencv2/opencv.hpp>
#include <cuda_runtime.h>
#include "cuda_fun_api/polar2cart.h"
using namespace std::chrono;
// Pi as a typed constant. The original object-like macro
// `#define PI 2 * std::acos(0.0)` had no parentheses, so any expression other
// than a plain multiplication (e.g. `PI + 1`) would expand incorrectly.
static const double PI = 2.0 * std::acos(0.0);

// Benchmark driver: runs the annulus inverse-polar transform 20 times on a
// grayscale copy of 1.png and reports the per-run and average times, skipping
// the first run (dominated by the initial GPU memory allocation).
int main()
{
    cv::Mat ori_image = cv::imread("..\\..\\1.png");
    if (ori_image.empty())
    {
        // Fail early: cvtColor on an empty Mat would throw/crash.
        std::cerr << "failed to load ..\\..\\1.png" << std::endl;
        return 1;
    }
    cv::Mat ori_gray;
    cv::cvtColor(ori_image, ori_gray, cv::COLOR_BGR2GRAY);

    cv::Mat output_image;
    float sum_time = 0;
    int min_radius = 680;
    int max_radius = 1330;
    cv::Point2f circle_center = cv::Point2f(2102, 1539);

    const int kRuns = 20;
    for (int i = 0; i < kRuns; i++)
    {
        double t1 = cv::getTickCount();
        // OpenCV parallel_for_ version:
        // cuda_fun_api::polar2cart_cv(ori_gray.clone(), output_image, circle_center, min_radius, max_radius, 0, 2 * PI);
        // CUDA version:
        cuda_fun_api::polar2cart_cuda(ori_gray.clone(), output_image, circle_center, min_radius, max_radius, 0, 2 * PI);
        double t2 = cv::getTickCount();
        double elapsed_ms = (t2 - t1) * 1000 / cv::getTickFrequency();
        std::cout << elapsed_ms << std::endl;
        // Exclude the first iteration from the average (GPU allocation cost).
        if (i > 0)
            sum_time += elapsed_ms;
    }
    std::cout << "average:" << sum_time / (kRuns - 1) << std::endl;

    // Visualization (disabled):
    // cv::namedWindow("ori_image",0);
    // cv::circle(ori_image,circle_center,min_radius,cv::Scalar(0,0,255),10);
    // cv::circle(ori_image,circle_center,max_radius,cv::Scalar(0,0,255),10);
    // cv::resizeWindow("ori_image", ori_image.cols/5, ori_image.rows/5);
    // cv::imshow("ori_image", ori_image);
    // cv::namedWindow("output_image",0);
    // cv::resizeWindow("output_image", output_image.cols/5, output_image.rows/5);
    // cv::imshow("output_image", output_image);
    // cv::waitKey(0);
    return 0;
}
2.测试结果
对如下图像的环形区域进行极坐标逆变换,图像大小为1200万像素,循环测试20次。
原图:
极坐标逆变换后:
parallel_for_方式: CUDA方式:
3.结论
CUDA的方式会在第一张耗时较久(GPU中开辟内存),但整体的运行效率远高于parallel_for_方式。
CUDA方式所能提升的效率和图像大小有关,图像越大,计算量越大,提升的效率越高。
四、工程配置
本文将cuda相关的函数封装到cuda_function_api库中,与主程序解耦
# Locate the CUDA toolkit (FindCUDA module; provides cuda_add_library and
# the CUDA_* variables used below).
find_package(CUDA)
# NOTE(review): CMake ships no standard FindcuBLAS module; CUBLAS_LIBRARIES
# is normally set by FindCUDA itself — verify this call is actually needed.
find_package(cuBLAS)
# All .cu sources for the CUDA wrapper library.
set(CUDA_SOURCE_FILES src/cuda_fun_api/polar2cart.cu)
# Build the CUDA code as a static library so the main program is decoupled
# from nvcc; cuda_add_library compiles .cu files with the CUDA compiler.
cuda_add_library(cuda_function_api ${CUDA_SOURCE_FILES} STATIC)
target_link_libraries(cuda_function_api ${OpenCV_LIBS} ${CUBLAS_LIBRARIES})
# Main executable links OpenCV plus the CUDA wrapper library.
FILE(GLOB SOURCES src/main.cpp)
ADD_EXECUTABLE(main ${SOURCES})
TARGET_LINK_LIBRARIES(main ${OpenCV_LIBS} cuda_function_api)
# /MT: MSVC static runtime — must match the runtime used by the libraries
# being linked, or link errors/duplicate-runtime issues follow.
target_compile_options(main PRIVATE "/MT")