AVX-Accelerated Convolution, Part 2
Let's restructure yesterday's idea.
The problem:
Execution time of the original program compiled at O2:
Execution time after the AVX optimization:
Next, let's try to improve on top of AVX2.
Main differences and improvements of AVX2 over AVX (a short intrinsics sketch follows this list):
Vector integer instructions:
AVX focuses mainly on floating-point math, adding support for 256-bit-wide SIMD (single instruction, multiple data) vectors.
AVX2 adds 256-bit vector integer operations. Besides floating point, AVX2 can therefore run a wide range of operations on 256-bit integer vectors, which is especially useful in graphics, video processing, and digital signal processing.
FMA (fused multiply-add) instructions:
Strictly speaking, FMA is a separate extension (FMA3) rather than part of AVX itself; on Intel CPUs it arrived with Haswell together with AVX2, so processors with AVX2 can generally be assumed to support it. An FMA instruction performs a multiply and an add in a single operation with a single rounding, which improves both throughput and accuracy.
Gather instructions:
AVX2 introduces gather instructions, which load data from non-contiguous memory locations into a single register. This is particularly useful for sparse matrices and irregular data structures, because it reduces the amount of complex memory-access code that would otherwise have to be written by hand.
Vectorized bit operations:
AVX2 adds support for vector bit operations, enabling bit-level parallelism that can significantly speed up certain algorithms.
Extended integer instruction set:
Through its extended integer instruction set, AVX2 provides more ways to work with integer vectors, such as addition, subtraction, and multiplication.
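As a concrete illustration of the two features relevant below, here is a minimal, self-contained sketch (the file name and values are made up for this post, not part of the convolution code) that exercises _mm256_fmadd_pd and _mm256_i64gather_pd; compile with g++ -O2 -mavx2 -mfma.
// fma_gather_demo.cpp -- illustrative only
#include <immintrin.h>
#include <cstdio>

int main() {
    // FMA: d = a * b + c in a single instruction, with a single rounding step
    __m256d a = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);
    __m256d b = _mm256_set1_pd(10.0);
    __m256d c = _mm256_set1_pd(0.5);
    __m256d d = _mm256_fmadd_pd(a, b, c);            // {10.5, 20.5, 30.5, 40.5}

    // Gather: load four doubles from non-contiguous indices in one instruction
    double table[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    __m256i idx = _mm256_set_epi64x(6, 4, 2, 0);      // indices, highest lane first
    __m256d g = _mm256_i64gather_pd(table, idx, 8);   // scale = 8 bytes per double

    double out[4];
    _mm256_storeu_pd(out, d);
    std::printf("fmadd : %g %g %g %g\n", out[0], out[1], out[2], out[3]);
    _mm256_storeu_pd(out, g);
    std::printf("gather: %g %g %g %g\n", out[0], out[1], out[2], out[3]);
    return 0;
}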
The FMA-based AVX2 convolution function:
// conv_avx2.cpp
#include <immintrin.h>
#include <cstdint>

bool Convolve1D_Ks5_F64_AVX(double* __restrict__ y, const double* __restrict__ x, const double* __restrict__ kernel, int64_t num_pts) {
    constexpr int64_t kernel_size = 5;
    constexpr int64_t ks2 = kernel_size / 2;
    // Ensure the input size is sufficient for the convolution operation
    if (num_pts < kernel_size) {
        return false;
    }
    // Broadcast each kernel tap into its own AVX2 register
    __m256d k0 = _mm256_set1_pd(kernel[0]);
    __m256d k1 = _mm256_set1_pd(kernel[1]);
    __m256d k2 = _mm256_set1_pd(kernel[2]);
    __m256d k3 = _mm256_set1_pd(kernel[3]);
    __m256d k4 = _mm256_set1_pd(kernel[4]);
    // Process four output points per iteration using AVX2;
    // the loop bound keeps the widest load, &x[i + 2], inside the array
    int64_t i = ks2;
    for (; i + 4 <= num_pts - ks2; i += 4) {
        // Load shifted chunks of the input signal into AVX2 registers
        __m256d x0 = _mm256_loadu_pd(&x[i + 2]);
        __m256d x1 = _mm256_loadu_pd(&x[i + 1]);
        __m256d x2 = _mm256_loadu_pd(&x[i]);
        __m256d x3 = _mm256_loadu_pd(&x[i - 1]);
        __m256d x4 = _mm256_loadu_pd(&x[i - 2]);
        // Multiply each shifted chunk by its kernel tap and accumulate with FMA
        __m256d y_val = _mm256_setzero_pd();
        y_val = _mm256_fmadd_pd(x0, k0, y_val);
        y_val = _mm256_fmadd_pd(x1, k1, y_val);
        y_val = _mm256_fmadd_pd(x2, k2, y_val);
        y_val = _mm256_fmadd_pd(x3, k3, y_val);
        y_val = _mm256_fmadd_pd(x4, k4, y_val);
        // Store the four convolution results back to the output array
        _mm256_storeu_pd(&y[i], y_val);
    }
    // Scalar loop for the interior points left over by the 4-wide stride;
    // the ks2 border points at each end are left untouched to avoid reading past the array
    for (; i < num_pts - ks2; i++) {
        double result = 0.0;
        for (int64_t k = -ks2; k <= ks2; k++) {
            result += x[i - k] * kernel[k + ks2];
        }
        y[i] = result;
    }
    return true;
}
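For reference, a small driver along the following lines can be used to exercise the AVX2 version; the kernel values and array size are arbitrary placeholders, and the declaration simply mirrors the definition above. Build conv_avx2.cpp with -mavx2 -mfma and link the two files together.
// main_avx2.cpp -- hypothetical test driver
#include <cstdint>
#include <cstdio>
#include <vector>

bool Convolve1D_Ks5_F64_AVX(double* __restrict__ y, const double* __restrict__ x,
                            const double* __restrict__ kernel, int64_t num_pts);

int main() {
    const int64_t n = 1 << 20;
    std::vector<double> x(n, 1.0), y(n, 0.0);
    const double kernel[5] = {0.1, 0.2, 0.4, 0.2, 0.1};   // example 5-tap kernel
    if (!Convolve1D_Ks5_F64_AVX(y.data(), x.data(), kernel, n)) {
        std::puts("input too short");
        return 1;
    }
    // Interior points of a constant signal should equal the kernel sum (1.0 here)
    std::printf("y[10] = %f\n", y[10]);
    return 0;
}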
Now improve it with the AVX-512 instruction set:
// conv_avx512.cpp
#include <immintrin.h>
#include <cstdint>

bool Convolve1D_Ks5_F64_AVX512(double* __restrict__ y, const double* __restrict__ x, const double* __restrict__ kernel, int64_t num_pts) {
    constexpr int64_t kernel_size = 5;
    constexpr int64_t ks2 = kernel_size / 2;
    // Ensure the input size is sufficient for the convolution operation
    if (num_pts < kernel_size) {
        return false;
    }
    // Broadcast each kernel tap into its own AVX-512 register
    __m512d k0 = _mm512_set1_pd(kernel[0]);
    __m512d k1 = _mm512_set1_pd(kernel[1]);
    __m512d k2 = _mm512_set1_pd(kernel[2]);
    __m512d k3 = _mm512_set1_pd(kernel[3]);
    __m512d k4 = _mm512_set1_pd(kernel[4]);
    // Process eight output points per iteration using AVX-512;
    // the loop bound keeps the widest load, &x[i + 2], inside the array
    int64_t i = ks2;
    for (; i + 8 <= num_pts - ks2; i += 8) {
        // Load shifted chunks of the input signal into AVX-512 registers
        __m512d x0 = _mm512_loadu_pd(&x[i + 2]);
        __m512d x1 = _mm512_loadu_pd(&x[i + 1]);
        __m512d x2 = _mm512_loadu_pd(&x[i]);
        __m512d x3 = _mm512_loadu_pd(&x[i - 1]);
        __m512d x4 = _mm512_loadu_pd(&x[i - 2]);
        // Multiply each shifted chunk by its kernel tap and accumulate with FMA
        __m512d y_val = _mm512_setzero_pd();
        y_val = _mm512_fmadd_pd(x0, k0, y_val);
        y_val = _mm512_fmadd_pd(x1, k1, y_val);
        y_val = _mm512_fmadd_pd(x2, k2, y_val);
        y_val = _mm512_fmadd_pd(x3, k3, y_val);
        y_val = _mm512_fmadd_pd(x4, k4, y_val);
        // Store the eight convolution results back to the output array
        _mm512_storeu_pd(&y[i], y_val);
    }
    // Scalar loop for the interior points left over by the 8-wide stride;
    // the ks2 border points at each end are left untouched to avoid reading past the array
    for (; i < num_pts - ks2; i++) {
        double result = 0.0;
        for (int64_t k = -ks2; k <= ks2; k++) {
            result += x[i - k] * kernel[k + ks2];
        }
        y[i] = result;
    }
    return true;
}
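The AVX-512 build needs -mavx512f, and CPUs without AVX-512 will fault on these instructions, so it is worth choosing the path at run time. The wrapper below is my own glue code, not part of the original measurement setup: it assumes both object files are linked in (conv_avx2.cpp built with -mavx2 -mfma, conv_avx512.cpp with -mavx512f) and uses the GCC/Clang builtin __builtin_cpu_supports to pick the widest supported version.
// dispatch.cpp -- hypothetical runtime selection of the convolution path
#include <cstdint>

bool Convolve1D_Ks5_F64_AVX(double* __restrict__ y, const double* __restrict__ x,
                            const double* __restrict__ kernel, int64_t num_pts);
bool Convolve1D_Ks5_F64_AVX512(double* __restrict__ y, const double* __restrict__ x,
                               const double* __restrict__ kernel, int64_t num_pts);

// Pick the widest vector path the running CPU actually supports
bool Convolve1D_Ks5_F64(double* y, const double* x, const double* kernel, int64_t num_pts) {
    if (__builtin_cpu_supports("avx512f")) {
        return Convolve1D_Ks5_F64_AVX512(y, x, kernel, num_pts);
    }
    return Convolve1D_Ks5_F64_AVX(y, x, kernel, num_pts);
}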
Execution results: