原始版本
用for循环对一个大数组(约80MB)求和。
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
/* Number of elements in the benchmark array. */
#define N 10000000
/* Benchmark input, declared at file scope: N * sizeof(uint64_t) = 80,000,000 bytes. */
uint64_t a[N];
/*
 * Baseline benchmark: fill the file-scope array a[] with values of
 * random() reduced modulo 10000, then time a plain sequential summation of
 * all N elements.
 *
 * Prints the sum and the elapsed time in milliseconds as measured with
 * CLOCK_MONOTONIC. Always returns 0.
 */
int main(void) {
    /* Populate the input outside the timed region. */
    for (int i = 0; i < N; i++) {
        a[i] = random() % 10000;
    }

    uint64_t sum = 0;
    struct timespec start, end;

    clock_gettime(CLOCK_MONOTONIC, &start);
    for (int i = 0; i < N; i++) {
        sum += a[i];
    }
    clock_gettime(CLOCK_MONOTONIC, &end);

    /* Combine whole seconds and nanoseconds in ns, then scale to ms. */
    double elapsed_ms = (end.tv_sec - start.tv_sec) * 1e9;
    elapsed_ms = (elapsed_ms + (end.tv_nsec - start.tv_nsec)) * 1e-6;

    /*
     * uint64_t is not `unsigned long` on every ABI, so "%lu" is a
     * format/argument mismatch there. Widen explicitly and use "%llu".
     */
    printf("sum = %llu, time_escape = %f ms\n",
           (unsigned long long)sum, elapsed_ms);
    return 0;
}
结果如下,耗时约14ms.
sum = 49991029754, time_escape = 14.465031 ms
优化1
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
/* Element count of the array that the timed loop sums. */
#define N 10000000
/* File-scope input array of N 64-bit unsigned values (8 bytes each). */
uint64_t a[N];
/*
 * Prefetch variant 1: same summation as the baseline, but every iteration
 * issues a prefetch hint for the element DI positions ahead.
 *
 * Prints the sum and the elapsed time (milliseconds, CLOCK_MONOTONIC).
 * Always returns 0.
 */
int main(void) {
    /* Populate the input outside the timed region. */
    for (int i = 0; i < N; i++) {
        a[i] = random() % 10000;
    }

    uint64_t sum = 0;
    struct timespec start, end;

    clock_gettime(CLOCK_MONOTONIC, &start);
/* Prefetch distance, in elements. */
#define DI 8
    int i = 0;
    /*
     * Main part: &a[i + DI] is at most &a[N] (one past the end), so the
     * address expression handed to the prefetch builtin is always valid.
     */
    for (; i + DI <= N; i++) {
        __builtin_prefetch(&a[i + DI]);
        sum += a[i];
    }
    /*
     * Tail: the last DI - 1 elements have nothing left to prefetch; forming
     * &a[i + DI] here would point more than one past the end of a[].
     */
    for (; i < N; i++) {
        sum += a[i];
    }
    clock_gettime(CLOCK_MONOTONIC, &end);

    /* Combine whole seconds and nanoseconds in ns, then scale to ms. */
    double elapsed_ms = (end.tv_sec - start.tv_sec) * 1e9;
    elapsed_ms = (elapsed_ms + (end.tv_nsec - start.tv_nsec)) * 1e-6;

    /*
     * uint64_t is not `unsigned long` on every ABI, so "%lu" is a
     * format/argument mismatch there. Widen explicitly and use "%llu".
     */
    printf("sum = %llu, time_escape = %f\n",
           (unsigned long long)sum, elapsed_ms);
    return 0;
}
看到约能优化1ms
sum = 49991029754, time_escape = 13.362668
稍微分析一下:假设cache line为64字节(常见的x86平台),一条cache line正好容纳8个uint64。这里前8个uint64没有被预取,但在读取a[0]时整条cache line已被加载,等于只发生一次初始的cache miss,后面的数组元素都能被预取到。但可以看到很多预取是重复的(连续8次预取落在同一条cache line上),
最好每隔8次循环预取一次。
优化2
沿用上面的思路,每隔8个元素预取一次:可以将原始的循环拆分为两层,内层循环8次;考虑到循环语句本身会引入额外的指令,这里手动将内层循环展开,代码如下:
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
/* Number of elements to sum; with this value N is an exact multiple of 8. */
#define N 10000000
/* File-scope array of N uint64_t values that the benchmark fills and sums. */
uint64_t a[N];
/*
 * Prefetch variant 2: the summation loop is manually unrolled by DI (8)
 * elements and issues a single prefetch hint per unrolled group instead of
 * one per element.
 *
 * Prints the sum and the elapsed time (milliseconds, CLOCK_MONOTONIC).
 * Always returns 0.
 */
int main(void) {
    /* Populate the input outside the timed region. */
    for (int i = 0; i < N; i++) {
        a[i] = random() % 10000;
    }

    uint64_t sum = 0;
    struct timespec start, end;

    clock_gettime(CLOCK_MONOTONIC, &start);
/* Unroll factor and prefetch distance, in elements. The unrolled body below
 * hard-codes eight additions and must be kept in sync with this value. */
#define DI 8
    int i = 0;
    /*
     * Full groups only: the bound i + DI <= N keeps a[i] .. a[i + 7] inside
     * the array and &a[i + DI] at most one past its end.
     */
    for (; i + DI <= N; i += DI) {
        __builtin_prefetch(&a[i + DI]);
        sum += a[i];
        sum += a[i + 1];
        sum += a[i + 2];
        sum += a[i + 3];
        sum += a[i + 4];
        sum += a[i + 5];
        sum += a[i + 6];
        sum += a[i + 7];
    }
    /*
     * Remainder when N is not a multiple of DI. Without this the unrolled
     * loop would read past the end of a[] for such N.
     */
    for (; i < N; i++) {
        sum += a[i];
    }
    clock_gettime(CLOCK_MONOTONIC, &end);

    /* Combine whole seconds and nanoseconds in ns, then scale to ms. */
    double elapsed_ms = (end.tv_sec - start.tv_sec) * 1e9;
    elapsed_ms = (elapsed_ms + (end.tv_nsec - start.tv_nsec)) * 1e-6;

    /*
     * uint64_t is not `unsigned long` on every ABI, so "%lu" is a
     * format/argument mismatch there. Widen explicitly and use "%llu".
     */
    printf("sum = %llu, time_escape = %f\n",
           (unsigned long long)sum, elapsed_ms);
    return 0;
}
结果如下,又优化了大概0.8ms。
sum = 49991029754, time_escape = 12.477051
标签:uint64,10000000,探索,int,预取,include,优化
From: https://www.cnblogs.com/zwlwf/p/17392513.html