以下代码基于 VS2015、Qt 5.9 和 OpenCV 4.3.0,CPU 型号是 Intel Core i5-7400。功能是对图像进行二值化,并比较 OpenCV 的 threshold、普通循环、SSE 指令和 AVX 指令四种实现的耗时。下面直接上代码:
void main() { Mat image(1024, 1024, CV_8UC1, Scalar(255)); circle(image, Point2i(500, 500), 200, Scalar(0), -1); int64 t1, t2; Mat binar1(image.size(), image.type()); Mat binar2(image.size(), image.type()); // 确保32字节对齐 ASSERT(int64(image.data) % 32 == 0); ASSERT(int64(binar1.data) % 32 == 0); ASSERT(int64(binar2.data) % 32 == 0); t1 = getTickCount(); threshold(image, binar1, 127, 255, THRESH_BINARY); t2 = getTickCount(); qDebug() << u8"OPENCV(ms):" << (t2 - t1) / getTickFrequency() * 1000; t1 = getTickCount(); for (int i = 0; i < 1024; i++) { const uchar* line = image.ptr<uchar>(i); uchar* dest = binar2.ptr<uchar>(i); for (int j = 0; j < 1024; j++) { dest[j] = line[j] > 127 ? 255 : 0; } } t2 = getTickCount(); qDebug() << u8"NONE(ms):" << (t2 - t1) / getTickFrequency() * 1000; t1 = getTickCount(); __m128i m128t = _mm_set_epi16(127, 127, 127, 127, 127, 127, 127, 127); __m128i m128h = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 14, 12, 10, 8, 6, 4, 2, 0); for (int i = 0; i < 1024; i++) { const uchar* line = image.ptr<uchar>(i); uchar* dest = binar2.ptr<uchar>(i); for (int j = 0; j < 1024; j += 8) { __m128i mmx08 = _mm_set_epi64x(0, *(int64*)&line[j]); __m128i mmx16 = _mm_cvtepu8_epi16(mmx08); __m128i res = _mm_cmplt_epi16(m128t, mmx16); __m128i half = _mm_shuffle_epi8(res, m128h); *(int64*)&dest[j] = _mm_extract_epi64(half, 0); } } t2 = getTickCount(); qDebug() << u8"SSE(ms):" << (t2 - t1) / getTickFrequency() * 1000; t1 = getTickCount(); __m256i m256t = _mm256_set1_epi16(127); __m256i m256h = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 14, 12, 10, 8, 6, 4, 2, 0, -1, -1, -1, -1, -1, -1, -1, -1, 14, 12, 10, 8, 6, 4, 2, 0); for (int i = 0; i < 1024; i++) { const uchar* line = image.ptr<uchar>(i); uchar* dest = binar2.ptr<uchar>(i); for (int j = 0; j < 1024; j += 16) { __m128i mmx08 = _mm_set_epi64x(*(int64*)&line[j + 8], *(int64*)&line[j]); __m256i mmx16 = _mm256_cvtepu8_epi16(mmx08); __m256i res = _mm256_cmpgt_epi16(mmx16, m256t); __m256i half = 
_mm256_shuffle_epi8(res, m256h); *(int64*)&dest[j] = _mm256_extract_epi64(half, 0); *(int64*)&dest[j + 8] = _mm256_extract_epi64(half, 2); } } t2 = getTickCount(); qDebug() << u8"AVX(ms):" << (t2 - t1) / getTickFrequency() * 1000; }
在 Release 版下执行 50 次的输出如下。从这一批次的输出可知,AVX 优化版本的耗时在大多数情况下都低于 OpenCV 的 threshold:
OPENCV(ms): 2.0732 NONE(ms): 0.7314 SSE(ms): 0.2543 AVX(ms): 0.2199 OPENCV(ms): 0.4455 NONE(ms): 0.7666 SSE(ms): 0.293 AVX(ms): 0.179 OPENCV(ms): 0.6254 NONE(ms): 0.8789 SSE(ms): 0.2223 AVX(ms): 0.1512 OPENCV(ms): 0.4486 NONE(ms): 0.7306 SSE(ms): 0.2154 AVX(ms): 0.175 OPENCV(ms): 0.5774 NONE(ms): 2.3402 SSE(ms): 0.2871 AVX(ms): 0.2766 OPENCV(ms): 0.3737 NONE(ms): 0.7787 SSE(ms): 0.3047 AVX(ms): 0.3284 OPENCV(ms): 0.3145 NONE(ms): 0.7349 SSE(ms): 0.3549 AVX(ms): 0.3025 OPENCV(ms): 0.4318 NONE(ms): 0.7679 SSE(ms): 2.4315 AVX(ms): 0.2681 OPENCV(ms): 0.3959 NONE(ms): 0.9343 SSE(ms): 0.3756 AVX(ms): 0.439 OPENCV(ms): 0.3512 NONE(ms): 2.4505 SSE(ms): 0.377 AVX(ms): 0.2237 OPENCV(ms): 0.5284 NONE(ms): 0.7935 SSE(ms): 0.4699 AVX(ms): 0.2633 OPENCV(ms): 0.4671 NONE(ms): 0.8124 SSE(ms): 0.2919 AVX(ms): 0.2929 OPENCV(ms): 0.5293 NONE(ms): 0.7665 SSE(ms): 0.3181 AVX(ms): 0.408 OPENCV(ms): 0.6264 NONE(ms): 0.8933 SSE(ms): 0.2657 AVX(ms): 0.3929 OPENCV(ms): 0.5343 NONE(ms): 0.8591 SSE(ms): 0.3004 AVX(ms): 0.8155 ...<输出太多删除一部分> OPENCV(ms): 0.3946 NONE(ms): 1.2074 SSE(ms): 0.3121 AVX(ms): 0.3349 OPENCV(ms): 0.6635 NONE(ms): 0.8499 SSE(ms): 0.2915 AVX(ms): 0.3152 OPENCV(ms): 0.6398 NONE(ms): 0.9685 SSE(ms): 0.3917 AVX(ms): 0.2999 OPENCV(ms): 0.3454 NONE(ms): 0.9082 SSE(ms): 0.3983 AVX(ms): 0.3385 OPENCV(ms): 0.3415 NONE(ms): 1.035 SSE(ms): 0.3842 AVX(ms): 0.2633 OPENCV(ms): 0.4105 NONE(ms): 1.1947 SSE(ms): 0.3958 AVX(ms): 0.3525 OPENCV(ms): 0.612 NONE(ms): 0.9998 SSE(ms): 0.3176 AVX(ms): 0.3837 OPENCV(ms): 0.4727 NONE(ms): 0.8645 SSE(ms): 0.2794 AVX(ms): 0.2068 OPENCV(ms): 0.6206 NONE(ms): 0.9266 SSE(ms): 0.3822 AVX(ms): 0.3107 OPENCV(ms): 0.6847 NONE(ms): 0.9386 SSE(ms): 0.3073 AVX(ms): 0.4238 OPENCV(ms): 0.4841 NONE(ms): 1.002 SSE(ms): 0.2424 AVX(ms): 0.2825 OPENCV(ms): 0.5021 NONE(ms): 1.2102 SSE(ms): 0.3045 AVX(ms): 0.2816 OPENCV(ms): 0.6298 NONE(ms): 1.6238 SSE(ms): 0.4122 AVX(ms): 0.2643 OPENCV(ms): 0.8655 NONE(ms): 1.0023 SSE(ms): 0.3301 AVX(ms): 0.3396 OPENCV(ms): 0.6918 
NONE(ms): 0.8999 SSE(ms): 0.2622 AVX(ms): 0.1829
标签:NONE,AVX,指令集,OPENCV,int64,ms,图像,SSE,二值化 From: https://www.cnblogs.com/mengxiangdu/p/17244938.html