1. 目标:使用 NEON intrinsic 函数,将 512×512 的四通道 PNG 图像顺时针旋转 90 度。
思路:将像素按 4×4 分块,先对块内像素做转置,再做水平镜像后写入目标位置。图像读写使用 stb_image / stb_image_write 库。
2. 代码
#include <stdio.h>
#include <arm_neon.h>
#include <stdlib.h>
#define STB_IMAGE_IMPLEMENTATION
#include "./stb/stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "./stb/stb_image_write.h"
// #define DEBUG
/* The NEON kernel moves one 128-bit register per block row: 4 pixels x 4 bytes. */
enum { RGBA_CHANNELS = 4, BLOCK_SIZE = 4 };

/*
 * Copy a single RGBA pixel from source (row r, column col) to its position in
 * the clockwise-rotated image: destination row col, column h-1-r.
 * The destination has w rows of h pixels each.
 */
static void rotate_pixel_scalar(const uint8_t *src, uint8_t *dst,
                                int w, int h, int r, int col)
{
    const uint8_t *s = src + ((size_t)r * w + col) * RGBA_CHANNELS;
    uint8_t *d = dst + ((size_t)col * h + (size_t)(h - 1 - r)) * RGBA_CHANNELS;

    for (int k = 0; k < RGBA_CHANNELS; k++)
        d[k] = s[k];
}

/*
 * Rotate a w x h RGBA8 image 90 degrees clockwise.
 *
 * src: w*h*4 bytes, row-major, row stride w pixels.
 * dst: w*h*4 bytes, receives h-wide, w-tall image (row stride h pixels).
 *
 * Full 4x4 pixel blocks are transposed in NEON registers, lane-reversed
 * (horizontal mirror) and stored; rows/columns left over when w or h is not a
 * multiple of 4 are handled pixel by pixel so no access leaves the buffers.
 */
static void rotate90_cw_rgba(const uint8_t *src, uint8_t *dst, int w, int h)
{
    const int wFull = w - w % BLOCK_SIZE; /* columns covered by whole blocks */
    const int hFull = h - h % BLOCK_SIZE; /* rows covered by whole blocks */

    for (int i = 0; i < hFull; i += BLOCK_SIZE) {
        for (int j = 0; j < wFull; j += BLOCK_SIZE) {
            uint32x4_t row[BLOCK_SIZE];
            uint32x4_t col[BLOCK_SIZE];

            /* Load 4 rows of 4 pixels; each pixel is one 32-bit lane. */
            for (int m = 0; m < BLOCK_SIZE; m++) {
                const uint8_t *p = src + ((size_t)(i + m) * w + j) * RGBA_CHANNELS;
                row[m] = vreinterpretq_u32_u8(vld1q_u8(p));
            }

            /* Step 1: transpose 32-bit lanes inside each pair of rows.
             * t01.val[0] = {a0,b0,a2,b2}, t01.val[1] = {a1,b1,a3,b3}. */
            uint32x4x2_t t01 = vtrnq_u32(row[0], row[1]);
            uint32x4x2_t t23 = vtrnq_u32(row[2], row[3]);

            /* Step 2: exchange 64-bit halves between the pairs, so that
             * col[k] holds source column j+k for rows i..i+3. */
            col[0] = vcombine_u32(vget_low_u32(t01.val[0]), vget_low_u32(t23.val[0]));
            col[1] = vcombine_u32(vget_low_u32(t01.val[1]), vget_low_u32(t23.val[1]));
            col[2] = vcombine_u32(vget_high_u32(t01.val[0]), vget_high_u32(t23.val[0]));
            col[3] = vcombine_u32(vget_high_u32(t01.val[1]), vget_high_u32(t23.val[1]));

            for (int m = 0; m < BLOCK_SIZE; m++) {
                /* Reverse the 4 lanes (horizontal mirror): swap within each
                 * 64-bit half, then swap the halves. */
                uint32x4_t v = vrev64q_u32(col[m]);
                v = vcombine_u32(vget_high_u32(v), vget_low_u32(v));

                /* Source rows i..i+3 map to destination columns
                 * h-1-i .. h-4-i; i <= hFull-4 keeps the offset >= 0. */
                uint8_t *q = dst + ((size_t)(j + m) * h + (size_t)(h - i - BLOCK_SIZE)) * RGBA_CHANNELS;
                vst1q_u8(q, vreinterpretq_u8_u32(v));
            }
        }

        /* Columns to the right of the last whole block. */
        for (int m = 0; m < BLOCK_SIZE; m++)
            for (int j = wFull; j < w; j++)
                rotate_pixel_scalar(src, dst, w, h, i + m, j);
    }

    /* Rows below the last whole block. */
    for (int i = hFull; i < h; i++)
        for (int j = 0; j < w; j++)
            rotate_pixel_scalar(src, dst, w, h, i, j);
}

#ifdef DEBUG
/* Print an image of `rows` x `cols` RGBA pixels, one text line per row. */
static void dump_pixels(const uint8_t *img, int rows, int cols)
{
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols * RGBA_CHANNELS; j += 4) {
            const uint8_t *p = img + (size_t)i * cols * RGBA_CHANNELS + j;
            printf("%u%u%u%u ", p[0], p[1], p[2], p[3]);
        }
        printf("\n");
    }
}
#endif

/*
 * Load ./pic.png, rotate it 90 degrees clockwise and write pic1.png.
 * With DEBUG defined, an 8x8 synthetic image is rotated instead and both the
 * input and the result are printed.
 *
 * Returns 0 on success, 1 if loading, allocation or writing fails.
 */
int main()
{
    int w, h, c;
    int rc = 1;

#ifdef DEBUG
    w = h = 8;
    c = RGBA_CHANNELS;
    uint8_t *src = (uint8_t *)calloc((size_t)w * h * c, 1);
    if (!src) {
        fprintf(stderr, "alloc src failed.\n");
        return 1;
    }
    /* Each row holds the byte values 0..w*c-1 so positions are recognisable. */
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < w * c; j++)
            src[i * w * c + j] = (uint8_t)j;
    }
    dump_pixels(src, h, w);
    printf("======\n");
#else
    /* Ask stb_image for 4 channels: the kernel assumes 4 bytes per pixel,
     * whatever the file itself stores. */
    uint8_t *src = stbi_load("./pic.png", &w, &h, &c, RGBA_CHANNELS);
    if (!src) {
        printf("load img failed.\n");
        return 1;
    }
    /* c is the channel count found in the file; the buffer always has 4. */
    printf("int w %d h %d c %d\n", w, h, c);
    c = RGBA_CHANNELS;
#endif

    uint8_t *dst = (uint8_t *)calloc((size_t)w * h, (size_t)c);
    if (dst) {
        rotate90_cw_rgba(src, dst, w, h);
#ifdef DEBUG
        /* Rotated image is h pixels wide and w pixels tall. */
        dump_pixels(dst, w, h);
        rc = 0;
#else
        /* stbi_write_png returns 0 on failure. */
        if (stbi_write_png("pic1.png", h, w, c, dst, h * c))
            rc = 0;
        else
            fprintf(stderr, "write pic1.png failed.\n");
#endif
    } else {
        fprintf(stderr, "alloc dst failed.\n");
    }

    free(dst);
#ifdef DEBUG
    free(src);
#else
    stbi_image_free(src);
#endif
    return rc;
}