改写自:https://zhuanlan.zhihu.com/p/451101452,改用零拷贝方案。
1、host函数
1 #include <CL/cl.h> 2 #include <stdio.h> 3 #include <stdlib.h> 4 #include <string.h> 5 #include <math.h> 6 #include <stdbool.h> 7 //#include <fstream> 8 //#include <string> 9 //#include <iostream> 10 11 #ifndef _WIN32 12 #include <sys/time.h> 13 #else 14 #include <time.h> 15 #include <windows.h> 16 #endif 17 18 #define HKA_EPS_F32 ( 1.192092890e-07f ) // 单精度浮点标准中最小差值 19 //typedef unsigned __int64 UINT64; 20 21 typedef unsigned long long UINT64; 22 typedef struct _HKA_POINT_I 23 { 24 int x; // x坐标 25 int y; // y坐标 26 }HKA_POINT_I; 27 //cl_int ConvertToString(const char* pFileName, std::string& str); 28 int AVMF_inv_mat(float* a); 29 int AVMF_STITCH_get_rotate_matrix(float rotatematrix[3][3], float angle, HKA_POINT_I center); 30 31 int sys_time_get_utc_msec(UINT64* puUtcMSec); 32 #if 0 33 template <typename T> 34 void check(T result, char const *const func, const char *const file, int const line) 35 { 36 if (result) 37 { 38 fprintf(stderr, "CL error at %s:%d code=%d \"%s\" \n", file, line, 39 static_cast<unsigned int>(result), func); 40 exit(result); 41 } 42 } 43 44 #define CHECK_OPENCL_ERROR(val) check((val), #val, __FILE__, __LINE__) 45 #endif 46 void DataInit(cl_uchar *p_data, int width, int height); 47 void DataCompare(cl_uchar *src1, cl_uchar *src2, int width, int height); 48 49 cl_context CreateContext(cl_device_id *p_device); 50 cl_command_queue CreateCommandQueue(cl_context context, cl_device_id device); 51 cl_program CreateProgram(cl_context context, cl_device_id device, const char *source); 52 cl_kernel CreateKernel(cl_program program, const char *kernel_name, cl_device_id device); 53 void PrintProfilingInfo(cl_event event); 54 55 bool CreateMemObject(cl_context context, cl_mem memobject[2], cl_uchar *img_ptr, 56 cl_uint image_size); 57 void CleanUp(cl_context context, cl_command_queue commandqueue, cl_program program, 58 cl_kernel kernel); 59 60 void CpuTranspose(cl_uchar *src, cl_uchar *dst, int src_width, int src_height, int antiRotateMatrixInt[9]); 61 
void PrintMatrix(cl_uchar *matrix, int width, int height); 62 63 char *ClUtilReadFileToString(const char *filename); 64 void ClUtilWriteStringToFile(const cl_uchar *text, size_t text_length, char *filename); 65 //void PrintDuration(timeval *start, const char *str, int loop_count); 66 void CheckClStatus(cl_int ret, const char *failure_msg); 67 68 int main() 69 { 70 cl_device_id device; 71 cl_context context; 72 cl_command_queue command_queue; 73 cl_program program; 74 cl_kernel kernel; 75 cl_mem buffer_src; 76 cl_mem buffer_dst; 77 cl_int err_num = CL_SUCCESS; 78 cl_uint bufferSizeInBytes; 79 cl_int iStatus = 0; // 函数返回状态 80 //string strSource = ""; // 用于存储cl文件中的代码 81 cl_mem buffer_matrix = NULL; 82 //timeval start; 83 UINT64 t1, t2; 84 FILE *fInput = NULL; 85 FILE *fOutput = NULL; 86 const int c_loop_count = 10; 87 88 /* 图像参数 */ 89 int width = 1280; 90 int height = 960; 91 bufferSizeInBytes = width * height * sizeof(cl_uchar); 92 cl_uchar *hostSrcBuffer = (cl_uchar *)malloc(bufferSizeInBytes); 93 cl_uchar *cpuDstBuffer = (cl_uchar *)malloc(bufferSizeInBytes); 94 //cl_uchar *gpuDstBuffer = (cl_uchar *)malloc(bufferSizeInBytes); 95 //memset(gpuDstBuffer, 0, bufferSizeInBytes); 96 97 //旋转参数 98 float rotatematrix[3][3] = { {0.0f,0.0f,0.0f} };/* 旋转矩阵 */ 99 HKA_POINT_I center = { width / 2,height / 2 }; /* 旋转中心 */ 100 float antiRotateMatrix[9]; /* 旋转矩阵逆矩阵 */ 101 int antiRotateMatrixInt[9]; /* 旋转矩阵逆矩阵整数化 */ 102 AVMF_STITCH_get_rotate_matrix(rotatematrix, -3.1415926 / 4, center); 103 memcpy((void*)antiRotateMatrix, (void*)rotatematrix, sizeof(rotatematrix)); 104 /* 求逆矩阵 */ 105 if (AVMF_inv_mat(antiRotateMatrix) < 0) 106 { 107 printf("AVMF_inv_mat err \n"); 108 } 109 /* 整数化 */ 110 int index = 0; 111 for (index = 0; index < 9; index++) 112 { 113 antiRotateMatrixInt[index] = (int)(antiRotateMatrix[index] * (1 << 8)); 114 } 115 antiRotateMatrixInt[2] += (1 << (8 - 1)); 116 antiRotateMatrixInt[5] += (1 << (8 - 1)); 117 118 119 120 121 // Step 1-3 查询平台设备并创建context 122 context = 
CreateContext(&device); 123 if (NULL == context) 124 { 125 printf("MainError:Create Context Failed!\n"); 126 return -1; 127 } 128 129 // Step 4 创建command queue 130 command_queue = CreateCommandQueue(context, device); 131 if (NULL == command_queue) 132 { 133 printf("MainError:Create CommandQueue Failed!\n"); 134 return -1; 135 } 136 137 // 读取OpenCL C源代码 138 char *device_source_str = ClUtilReadFileToString("kerneltest.cl"); 139 if (NULL == device_source_str) 140 { 141 printf("MainError:ClUtilReadFileToString Failed!\n"); 142 return -1; 143 } 144 program = CreateProgram(context, device, device_source_str); 145 if (NULL == program) 146 { 147 printf("MainError:Create Program Failed!\n"); 148 return -1; 149 } 150 151 // Step 6 创建编译kernel 152 kernel = CreateKernel(program, "image_rotate_matrix", device); 153 if (NULL == kernel) 154 { 155 printf("MainError:Create Kernel Failed!\n"); 156 return -1; 157 } 158 159 // Step 7 创建内存对象 160 #if 0 161 /* 传统方案 */ 162 sys_time_get_utc_msec(&t1); 163 buffer_src = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 164 bufferSizeInBytes, hostSrcBuffer, &err_num); 165 CheckClStatus(err_num, "Create src buffer"); 166 buffer_dst = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bufferSizeInBytes, NULL, &err_num); 167 CheckClStatus(err_num, "Create dst buffer"); 168 buffer_matrix = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 169 sizeof(antiRotateMatrixInt), antiRotateMatrixInt, &err_num); 170 CheckClStatus(err_num, "Create matrix buffer"); 171 172 sys_time_get_utc_msec(&t2); 173 printf(" clCreateBuffer cost %llu \n", (t2 - t1)); 174 #endif 175 176 /* ZERO COPY方案 */ 177 sys_time_get_utc_msec(&t1); 178 buffer_src = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, bufferSizeInBytes, NULL, &err_num); 179 CheckClStatus(err_num, "Create src buffer"); 180 buffer_dst = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, bufferSizeInBytes, NULL, &err_num); 181 CheckClStatus(err_num, 
"Create dst buffer"); 182 /* 旋转矩阵暂不使用零拷贝方案 */ 183 buffer_matrix = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(antiRotateMatrixInt), antiRotateMatrixInt, &err_num); 184 CheckClStatus(err_num, "Create matrix buffer"); 185 186 sys_time_get_utc_msec(&t2); 187 printf(" clCreateBuffer cost %llu \n", (t2 - t1)); 188 189 /* 建立输入数据显存到内存的映射 */ 190 sys_time_get_utc_msec(&t1); 191 cl_uchar *hostPtrSrc = (cl_uchar *)clEnqueueMapBuffer( 192 command_queue, 193 buffer_src, 194 CL_TRUE, 195 CL_MAP_WRITE, 196 0, 197 bufferSizeInBytes, 198 0, NULL, NULL, &iStatus); 199 sys_time_get_utc_msec(&t2); 200 201 printf(" clEnqueueWriteBuffer cost %llu %d \n", (t2 - t1), iStatus); 202 203 sys_time_get_utc_msec(&t1); 204 /* 建立输出数据显存到内存的映射 */ 205 cl_uchar *hostPtrDst = (cl_uchar *)clEnqueueMapBuffer( 206 command_queue, 207 buffer_dst, 208 CL_TRUE, 209 CL_MAP_WRITE, 210 0, 211 bufferSizeInBytes, 212 0, NULL, NULL, &iStatus); 213 sys_time_get_utc_msec(&t2); 214 215 // Step 8 设置kernelArg 216 err_num = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer_src); 217 err_num |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &buffer_dst); 218 err_num |= clSetKernelArg(kernel, 2, sizeof(int), &width); 219 err_num |= clSetKernelArg(kernel, 3, sizeof(int), &height); 220 err_num |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &buffer_matrix); 221 CheckClStatus(err_num, "Set Kernel Arg"); 222 223 size_t global_work_size[3]; 224 size_t local_work_size[3]; 225 226 // 设置NDRange尺寸 227 #if defined(QCOM_DEVICE) 228 local_work_size[0] = 32; 229 local_work_size[1] = 32; 230 #elif defined(MTK_DEVICE) 231 local_work_size[0] = 16; 232 local_work_size[1] = 16; 233 #else 234 local_work_size[0] = 16; 235 local_work_size[1] = 16; 236 #endif 237 local_work_size[2] = 0; 238 239 global_work_size[0] = 240 (width + local_work_size[0] - 1) / local_work_size[0] * local_work_size[0]; 241 global_work_size[1] = 242 (height + local_work_size[1] - 1) / local_work_size[1] * local_work_size[1]; 243 global_work_size[2] 
= 0; 244 245 printf("global_work_size=(%zu,%zu)\n", global_work_size[0], global_work_size[1]); 246 printf("local_work_size=(%zu,%zu)\n", local_work_size[0], local_work_size[1]); 247 248 249 fInput = fopen("./input/test_in.nv12", "r"); 250 if (NULL == fInput || NULL == hostSrcBuffer) 251 { 252 printf("open file failed \n"); 253 return -1; 254 } 255 //fread(hostSrcBuffer, sizeof(char), bufferSizeInBytes, fInput); 256 fread((void *)hostPtrSrc, sizeof(char), bufferSizeInBytes, fInput); 257 fclose(fInput); 258 printf("Matrix Width =%d Height=%d\n", width, height); 259 sys_time_get_utc_msec(&t1); 260 for (int i = 0; i < c_loop_count; i++) 261 { 262 CpuTranspose(hostPtrSrc, cpuDstBuffer, width, height, antiRotateMatrixInt); 263 } 264 sys_time_get_utc_msec(&t2); 265 printf(" CpuTranspose cost %llu \n", (t2 - t1) / c_loop_count); 266 /* 存储CPU数据 */ 267 fOutput = fopen("./output/cpu_test_out.nv12", "wb"); 268 if (NULL == fOutput) 269 { 270 printf("open file failed \n"); 271 return -1; 272 } 273 fwrite(cpuDstBuffer, sizeof(char), bufferSizeInBytes, fOutput); 274 fclose(fOutput); 275 276 //fseek(fInput, 0, SEEK_SET); 277 //fread((void *)hostPtrSrc, sizeof(char), bufferSizeInBytes, fInput); 278 //fclose(fInput); 279 280 //gettimeofday(&start, NULL); 281 for (int i = 0; i < c_loop_count; i++) 282 { 283 // Step 9 入队kernel执行 284 cl_event kernel_event = NULL; 285 err_num = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global_work_size, 286 local_work_size, 0, NULL, &kernel_event); 287 CheckClStatus(err_num, "ClEnqueueNDRangeKernel"); 288 // Step 10 同步 289 err_num = clWaitForEvents(1, &kernel_event); 290 CheckClStatus(err_num, "ClWaitForEvents"); 291 PrintProfilingInfo(kernel_event); 292 clReleaseEvent(kernel_event); 293 sleep(1); 294 } 295 296 /* 比较CPU数据与GPU数据是否有差异 */ 297 DataCompare(cpuDstBuffer, hostPtrDst, width, height); 298 299 /* 存储GPU数据 */ 300 fOutput = fopen("./output/gpu_test_out.nv12", "wb"); 301 if (NULL == fOutput) 302 { 303 printf("open file failed \n"); 304 
return -1; 305 } 306 fwrite((void*)hostPtrDst, sizeof(char), bufferSizeInBytes, fOutput); 307 fclose(fOutput); 308 309 iStatus = clEnqueueUnmapMemObject( 310 command_queue, 311 buffer_src, 312 (void *) hostPtrSrc, 313 0, NULL, NULL); 314 iStatus = clEnqueueUnmapMemObject( 315 command_queue, 316 buffer_dst, 317 (void *) hostPtrDst, 318 0, NULL, NULL); 319 free(device_source_str); 320 free(hostSrcBuffer); 321 free(cpuDstBuffer); 322 //free(gpuDstBuffer); 323 clReleaseMemObject(buffer_src); 324 clReleaseMemObject(buffer_dst); 325 326 CleanUp(context, command_queue, program, kernel); 327 return 0; 328 329 } 330 331 void DataInit(cl_uchar *p_data, int width, int height) 332 { 333 cl_uchar cnt = 0; 334 for (int i = 0; i < width * height; i++) 335 { 336 *p_data = cnt; 337 cnt++; 338 p_data++; 339 } 340 } 341 342 void PrintMatrix(cl_uchar *matrix, int width, int height) 343 { 344 for (int i = 0; i < height; i++) 345 { 346 for (int j = 0; j < width; j++) 347 { 348 printf("%d ", matrix[i * width + j]); 349 } 350 printf("\n"); 351 } 352 } 353 354 void CpuTranspose(cl_uchar *src, cl_uchar *dst, int src_width, int src_height, int matrix[9]) 355 { 356 for (int src_row = 0; src_row < src_height; src_row++) 357 { 358 for (int src_col = 0; src_col < src_width; src_col++) 359 { 360 int xpos = matrix[0] * src_col + matrix[1] * src_row + matrix[2]; 361 int ypos = matrix[3] * src_col + matrix[4] * src_row + matrix[5]; 362 363 xpos = (xpos >> 8); 364 ypos = (ypos >> 8); 365 if ((xpos >= 0) && (xpos < src_width) && (ypos >= 0) && (ypos < src_height)) //Bound Checking 366 { 367 dst[src_row * src_width + src_col] = src[ypos * src_width + xpos]; 368 } 369 } 370 } 371 } 372 373 cl_context CreateContext(cl_device_id *p_device) 374 { 375 cl_int err_num; 376 cl_uint num_platform; 377 cl_platform_id platform_id; 378 cl_context context = NULL; 379 err_num = clGetPlatformIDs(1, &platform_id, &num_platform); 380 if (CL_SUCCESS != err_num || num_platform <= 0) 381 { 382 printf("failed to find any 
opencl platform. \n"); 383 return NULL; 384 } 385 err_num = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, p_device, NULL); 386 if (CL_SUCCESS != err_num) 387 { 388 printf("there is no gpu.\n"); 389 return NULL; 390 } 391 context = clCreateContext(NULL, 1, p_device, NULL, NULL, &err_num); 392 if (CL_SUCCESS != err_num) 393 { 394 printf("create context error.\n"); 395 return NULL; 396 } 397 return context; 398 } 399 400 cl_command_queue CreateCommandQueue(cl_context context, cl_device_id device) 401 { 402 cl_command_queue_properties queue_prop[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; 403 cl_command_queue command_queue = NULL; 404 command_queue = clCreateCommandQueueWithProperties(context, device, queue_prop, NULL); 405 if (NULL == command_queue) 406 { 407 printf("create command queue failed.\n"); 408 } 409 return command_queue; 410 } 411 412 cl_program CreateProgram(cl_context context, cl_device_id device, const char *source) 413 { 414 cl_int err_num; 415 cl_program program; 416 417 program = clCreateProgramWithSource(context, 1, (const char **)&source, NULL, NULL); 418 if (NULL == program) 419 { 420 printf("create program failed.\n "); 421 return NULL; 422 } 423 err_num = clBuildProgram(program, 1, &device, NULL, NULL, NULL); 424 if (CL_SUCCESS != err_num) 425 { 426 clReleaseProgram(program); 427 return NULL; 428 } 429 return program; 430 } 431 432 cl_kernel CreateKernel(cl_program program, const char *kernel_name, cl_device_id device) 433 { 434 int err_num; 435 cl_kernel kernel; 436 kernel = clCreateKernel(program, kernel_name, &err_num); 437 if (err_num != CL_SUCCESS) 438 { 439 printf("create kernel failed.\n "); 440 return NULL; 441 } 442 size_t max_work_group_size; 443 size_t perferred_work_group_size_multiple; 444 err_num = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), 445 &max_work_group_size, NULL); 446 err_num |= 447 clGetKernelWorkGroupInfo(kernel, device, 
CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, 448 sizeof(size_t), &perferred_work_group_size_multiple, NULL); 449 if (err_num != CL_SUCCESS) 450 { 451 printf("Get kernel info failed.\n "); 452 return NULL; 453 } 454 printf("Kernel %s max workgroup size=%zu\n", kernel_name, max_work_group_size); 455 printf("Kernel %s perferred workgroup size multiple=%zu\n", kernel_name, 456 perferred_work_group_size_multiple); 457 return kernel; 458 } 459 460 void PrintProfilingInfo(cl_event event) 461 { 462 cl_ulong t_queued; 463 cl_ulong t_submitted; 464 cl_ulong t_started; 465 cl_ulong t_ended; 466 cl_ulong t_completed; 467 468 clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &t_queued, NULL); 469 clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &t_submitted, 470 NULL); 471 clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &t_started, NULL); 472 clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &t_ended, NULL); 473 clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &t_completed, 474 NULL); 475 476 printf("queue -> submit : %fus\n", (t_submitted - t_queued) * 1e-3); 477 printf("submit -> start : %fus\n", (t_started - t_submitted) * 1e-3); 478 printf("start -> end : %fus\n", (t_ended - t_started) * 1e-3); 479 printf("end -> finish : %f %f %fus\n", t_completed * 1e-3, t_ended * 1e-3, (t_completed - t_ended) * 1e-3); 480 } 481 482 void CleanUp(cl_context context, cl_command_queue commandqueue, cl_program program, 483 cl_kernel kernel) 484 { 485 486 if (NULL != kernel) 487 { 488 clReleaseKernel(kernel); 489 } 490 if (NULL != program) 491 { 492 clReleaseProgram(program); 493 } 494 if (NULL != commandqueue) 495 { 496 clReleaseCommandQueue(commandqueue); 497 } 498 if (NULL != context) 499 { 500 clReleaseContext(context); 501 } 502 } 503 504 void CheckClStatus(cl_int ret, const char *failure_msg) 505 { 506 if (ret != CL_SUCCESS) 507 { 508 
fprintf(stderr, "Error %d with %s\n", ret, failure_msg); 509 exit(ret); 510 } 511 return; 512 } 513 514 //void PrintDuration(timeval *begin, const char *function_name, int loop_count) 515 //{ 516 // timeval current; 517 // gettimeofday(¤t, NULL); 518 // uint64_t time_in_microseconds = 519 // (current.tv_sec - begin->tv_sec) * 1e6 + (current.tv_usec - begin->tv_usec); 520 // printf("%s consume average time: %ld us\n", function_name, time_in_microseconds / loop_count); 521 // return; 522 //} 523 // 将cl文件代码转为字符串 524 525 #if 0 526 cl_int ConvertToString(const char* pFileName, std::string& Str) 527 { 528 size_t uiSize = 0; 529 size_t uiFileSize = 0; 530 char* pStr = NULL; 531 532 /*FILE* pFile = fopen(pFileName, "r"); 533 if (!pFile) return -1; 534 fseek(pFile, 0L, SEEK_END); 535 int size = ftell(pFile); 536 fclose(pFile);*/ 537 538 539 std::fstream fFile(pFileName, (std::fstream::in | std::fstream::binary)); 540 541 if (fFile.is_open()) 542 { 543 fFile.seekg(0, std::fstream::end); 544 uiSize = uiFileSize = (size_t)fFile.tellg(); // 获得文件大小 545 fFile.seekg(0, std::fstream::beg); 546 pStr = new char[uiSize + 1]; 547 548 if (NULL == pStr) 549 { 550 fFile.close(); 551 return 0; 552 } 553 554 fFile.read(pStr, uiFileSize); // 读取uiFileSize字节 555 fFile.close(); 556 pStr[uiSize] = '\0'; 557 Str = pStr; 558 559 delete[] pStr; 560 561 return 0; 562 } 563 564 printf("Error: Failed to open cl file %s \n", pFileName); 565 566 return -1; 567 } 568 #endif 569 570 char *ClUtilReadFileToString(const char *filename) 571 { 572 573 FILE *fp; 574 char *fileData; 575 long fileSize; 576 577 /* Open the file */ 578 fp = fopen(filename, "rb"); 579 if (!fp) 580 { 581 printf("Could not open file: %s\n", filename); 582 exit(-1); 583 } 584 585 /* Determine the file size */ 586 if (fseek(fp, 0, SEEK_END)) 587 { 588 printf("Error reading the file\n"); 589 exit(-1); 590 } 591 fileSize = ftell(fp); 592 if (fileSize < 0) 593 { 594 printf("Error reading the file\n"); 595 exit(-1); 596 } 597 if (fseek(fp, 
0, SEEK_SET)) 598 { 599 printf("Error reading the file\n"); 600 exit(-1); 601 } 602 603 /* Read the contents */ 604 fileData = (char *)malloc(fileSize + 1); 605 if (!fileData) 606 { 607 exit(-1); 608 } 609 if (fread(fileData, fileSize, 1, fp) != 1) 610 { 611 printf("Error reading the file\n"); 612 exit(-1); 613 } 614 615 /* Terminate the string */ 616 fileData[fileSize] = '\0'; 617 618 /* Close the file */ 619 if (fclose(fp)) 620 { 621 printf("Error closing the file\n"); 622 exit(-1); 623 } 624 625 return fileData; 626 } 627 628 void ClUtilWriteStringToFile(const cl_uchar *text, size_t text_length, char *filename) 629 { 630 FILE *fp = fopen(filename, "wt+"); 631 if (NULL == fp) 632 return; 633 fwrite(text, 1, text_length, fp); 634 fclose(fp); 635 } 636 637 void DataCompare(cl_uchar *src1, cl_uchar *src2, int width, int height) 638 { 639 for (int i = 0; i < height; i++) 640 { 641 for (int j = 0; j < width; j++) 642 { 643 int idx = i * width + j; 644 if (src1[idx] != src2[idx]) 645 { 646 printf("Mismatch at (%d,%d), A= %d,B= %d\n", i, j, src1[idx], src2[idx]); 647 return; 648 } 649 } 650 } 651 printf("A and B match!\n"); 652 return; 653 } 654 655 /*************************************************************************************************** 656 * 功 能:求3x3矩阵逆矩阵,使用伴随矩阵的方式直接计算 657 * 参 数: 658 * a - I/O 源矩阵 659 * 返回值:无 660 * 备 注: 661 ***************************************************************************************************/ 662 int AVMF_inv_mat(float* a) 663 { 664 float det = 0.0f; 665 float recip_det = 0.0f; 666 float a00 = 0.0f; 667 float a01 = 0.0f; 668 float a02 = 0.0f; 669 float a10 = 0.0f; 670 float a11 = 0.0f; 671 float a12 = 0.0f; 672 float a20 = 0.0f; 673 float a21 = 0.0f; 674 float a22 = 0.0f; 675 676 a00 = a[0]; 677 a01 = a[1]; 678 a02 = a[2]; 679 a10 = a[3]; 680 a11 = a[4]; 681 a12 = a[5]; 682 a20 = a[6]; 683 a21 = a[7]; 684 a22 = a[8]; 685 686 // 计算3x3矩阵行列式 687 det = a00 * a11 * a22 + a01 * a12 * a20 + a02 * a21 * a10; 688 det -= a02 * a11 * a20 
+ a00 * a21 * a12 + a01 * a10 * a22; 689 690 // 除零保护 691 if (HKA_EPS_F32 > fabs(det)) 692 { 693 printf("error \n"); 694 return -1; 695 } 696 697 recip_det = 1.0f / det; 698 699 // 根据伴随矩阵除行列式算出逆矩阵,伴随矩阵已转置 det不为0,不做除零保护 700 a[0] = (a11 * a22 - a21 * a12) * recip_det; 701 a[1] = -(a01 * a22 - a21 * a02) * recip_det; 702 a[2] = (a01 * a12 - a11 * a02) * recip_det; 703 a[3] = -(a10 * a22 - a20 * a12) * recip_det; 704 a[4] = (a00 * a22 - a20 * a02) * recip_det; 705 a[5] = -(a00 * a12 - a10 * a02) * recip_det; 706 a[6] = (a10 * a21 - a20 * a11) * recip_det; 707 a[7] = -(a00 * a21 - a20 * a01) * recip_det; 708 a[8] = (a00 * a11 - a10 * a01) * recip_det; 709 710 return 0; 711 } 712 713 /*************************************************************************************************** 714 * 功 能:生成2D全景图 715 * 参 数:* 716 * modu_enable - I 2D类型,融合使能开关-----heyunyun 717 * sub_img - I 各镜头拼接子图 718 * weight - I 各子图权重表 719 * car_img - I 车模图像 720 * dst - O 2D全景图 721 * 返回值:状态码 722 * 备 注: 723 ***************************************************************************************************/ 724 725 int AVMF_STITCH_get_rotate_matrix( 726 float rotatematrix[3][3], 727 float angle, 728 HKA_POINT_I center) 729 { 730 int i = 0, j = 0, k = 0; 731 float temp = 0.0f; 732 733 float translate_l[3][3] = { { 1.0f, 0.0f, center.x }, { 0.0f, 1.0f, center.y }, { 0.0f, 0.0f, 1.0f } }; 734 float rotate[3][3] = { { cos(angle), -sin(angle), 0.0f }, { sin(angle), cos(angle), 0.0f }, { 0.0f, 0.0f, 1.0f } }; 735 float translate_r[3][3] = { { 1.0f, 0.0f, -center.x }, { 0.0f, 1.0f, -center.y }, { 0.0f, 0.0f, 1.0f } }; 736 737 float ans[3][3] = { {0.0f, 0.0f, 0.0f} }; 738 739 for (i = 0; i < 3; ++i) 740 { 741 for (j = 0; j < 3; ++j) 742 { 743 temp = 0.0f; 744 for (k = 0; k < 3; ++k) 745 { 746 temp += translate_l[i][k] * rotate[k][j]; 747 } 748 ans[i][j] = temp; 749 } 750 } 751 752 for (i = 0; i < 3; ++i) 753 { 754 for (j = 0; j < 3; ++j) 755 { 756 temp = 0.0f; 757 for (k = 0; k < 3; ++k) 758 { 759 temp += 
ans[i][k] * translate_r[k][j]; 760 } 761 rotatematrix[i][j] = temp; 762 } 763 } 764 return 0; 765 } 766 767 768 /*************************************************************************************************** 769 * 函数名:sys_time_get_utc_msec 770 * 功 能:读取系统时间 毫秒 771 * 参 数: 772 * 返回值:寄存器读数 773 * 备 注: 774 ***************************************************************************************************/ 775 int sys_time_get_utc_msec(UINT64* puUtcMSec) 776 { 777 #ifndef _WIN32 778 int iRet = 0; 779 struct timeval stTimeVal = { 0 }; 780 if (NULL == puUtcMSec) 781 { 782 return iRet; 783 } 784 iRet = gettimeofday(&stTimeVal, NULL); 785 if (iRet < 0) 786 { 787 return iRet; 788 } 789 //*puUtcMSec = (1000000 * stTimeVal.tv_sec + stTimeVal.tv_usec) / 1000; 790 *puUtcMSec = (1000000 * stTimeVal.tv_sec + stTimeVal.tv_usec); 791 return iRet; 792 793 #else 794 #define EPOCHFILETIME (116444736000000000UL) 795 FILETIME ft; 796 LARGE_INTEGER li; 797 UINT64 tt = 0; 798 GetSystemTimeAsFileTime(&ft); 799 li.LowPart = ft.dwLowDateTime; 800 li.HighPart = ft.dwHighDateTime; 801 // 从1970年1月1日0:0:0:000到现在的微秒数(UTC时间) 802 tt = (li.QuadPart - EPOCHFILETIME) / 10; 803 804 //*puUtcMSec = (tt / 1000); 805 *puUtcMSec = (tt); 806 return 0; 807 #endif 808 }opencl_test.c
2、内核函数
/*
 * kerneltest.cl
 *
 * Device kernels for the image-rotation demo.  All images are single-channel
 * 8-bit, stored row-major with a row stride equal to the image width.
 *
 * Every kernel guards against work-items outside the image, because the host
 * rounds the global work size up to a multiple of the work-group size.
 */

/*
 * Copies src to dst pixel by pixel.
 * NOTE: despite its name this kernel currently performs a plain copy; a real
 * transpose would write dst[g_idx * height + g_idy] instead.
 */
__kernel void TransposeKernel(__global const uchar *src, __global uchar *dst, int width,
                              int height)
{
    const int g_idx = (int)get_global_id(0);
    const int g_idy = (int)get_global_id(1);

    if ((g_idx >= width) || (g_idy >= height))
    {
        return;
    }
    dst[g_idy * width + g_idx] = src[g_idy * width + g_idx];
}

/*
 * Forward-mapped rotation around the image centre: each source pixel
 * (ix,iy) is written to its rotated position when that lies inside the
 * image.
 *
 * W, H               - image dimensions
 * sinTheta, cosTheta - sine / cosine of the rotation angle
 */
__kernel void image_rotate(__global const uchar *src_data, __global uchar *dest_data, int W,
                           int H, float sinTheta, float cosTheta)
{
    const int ix = (int)get_global_id(0);
    const int iy = (int)get_global_id(1);

    /* Padded work-items must not read outside the source image. */
    if ((ix >= W) || (iy >= H))
    {
        return;
    }

    const int xc = W / 2;
    const int yc = H / 2;

    const int xpos = (ix - xc) * cosTheta - (iy - yc) * sinTheta + xc;
    const int ypos = (ix - xc) * sinTheta + (iy - yc) * cosTheta + yc;

    if ((xpos >= 0) && (xpos < W) && (ypos >= 0) && (ypos < H))
    {
        dest_data[ypos * W + xpos] = src_data[iy * W + ix];
    }
}

/*
 * Inverse-mapped affine warp with a fixed-point matrix (8 fractional bits).
 * For each destination pixel (ix,iy):
 *   x = (m[0]*ix + m[1]*iy + m[2]) >> 8
 *   y = (m[3]*ix + m[4]*iy + m[5]) >> 8
 * and src(x,y) is copied when it lies inside the image; otherwise the
 * destination pixel is left untouched.
 *
 * matrix - at least six ints, row-major; must match the host's
 *          CpuTranspose reference.
 *
 * Define ROTATE_KERNEL_DEBUG at build time to print the matrix and the
 * mapped position for pixel (640,480).
 */
__kernel void image_rotate_matrix(__global const uchar *src_data, __global uchar *dest_data,
                                  int W, int H, __global const int *matrix)
{
    const int ix = (int)get_global_id(0);
    const int iy = (int)get_global_id(1);

    /* Padded work-items must not write outside the destination image. */
    if ((ix >= W) || (iy >= H))
    {
        return;
    }

    const int xpos = (matrix[0] * ix + matrix[1] * iy + matrix[2]) >> 8;
    const int ypos = (matrix[3] * ix + matrix[4] * iy + matrix[5]) >> 8;

    if ((xpos >= 0) && (xpos < W) && (ypos >= 0) && (ypos < H))
    {
        dest_data[iy * W + ix] = src_data[ypos * W + xpos];
    }

#ifdef ROTATE_KERNEL_DEBUG
    if (ix == 640 && iy == 480)
    {
        printf("\n %d %d %d %d %d %d %d %d \n", matrix[0], matrix[1], matrix[2], matrix[3],
               matrix[4], matrix[5], xpos, ypos);
    }
#endif
}
标签:float,const,cl,int,0.0,opencl,uchar,二维,数组 From: https://www.cnblogs.com/peifx/p/16987516.html