基本思想:因为看NCNN源码时,发现up主的代码中使用了OpenMP预编译指令,所以详细查阅了资料,先简单学习一下,等有时间再补充NCNN代码中的实例。这里原理不详细叙述,只记录使用方法,以备后续用到时方便查阅;
个人感觉OpenMP 最难的问题在于如何把存在数据依赖的运算关系拆成相互独立的运算,进行并行加速,先把简单的记录一下,后期看大神代码 逐渐完善example吧~
编译工具使用CLion 2021.1.1。因为使用OpenMP的主要目的在于加速处理,因此会着重比较串行程序和OpenMP并行程序的执行效率,有时间再补充OpenMP和多线程的执行效率对比
(1)OpenMP的常用的函数和使用:
测试代码:(CMakeLists.txt如下,后文不再重复粘贴)
cmake_minimum_required(VERSION 3.16)
project(OpenMP)
# Fix the language standard before the target is created so it applies to it.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# Let CMake discover the OpenMP flags and runtime library for the active
# compiler instead of hard-coding the GCC/Clang-only -fopenmp switch.
find_package(OpenMP REQUIRED COMPONENTS CXX)
add_executable(OpenMP main.cpp)
# The imported target carries both the compile option and the link dependency.
target_link_libraries(OpenMP PRIVATE OpenMP::OpenMP_CXX)
测试代码
#include <iostream>
#include <omp.h>
using namespace std;
/// Prints the calling thread's OpenMP id, the size of the current thread
/// team, and the processor count reported by the OpenMP runtime.
int main() {
    // All three queries are issued outside of any `#pragma omp parallel` region.
    const int threadId = omp_get_thread_num();
    std::cout << "the current thread id: " << threadId << std::endl;

    const int teamSize = omp_get_num_threads();
    std::cout << "the current thread num: " << teamSize << std::endl;

    const int processorCount = omp_get_num_procs();
    std::cout << "the current processer num: " << processorCount << std::endl;

    return 0;
}
结果:
F:\OpenMP\cmake-build-debug\OpenMP.exe
the current thread id: 0
the current thread num: 1
the current processer num: 12
Process finished with exit code 0
测试显示:当前的代码运行的线程id=0,当前的代码仅用一个线程执行,本机总共有12个逻辑CPU
6核CPU,12个逻辑处理器:一颗内核在一个时间片内只能执行一个内核线程;当物理CPU使用了超线程技术后,在CPU的一颗内核中,就是利用其中空闲的执行单元,模拟出另外一个核心(并不是真正的物理运算核心),使得CPU的这颗内核有两个逻辑核心,也就是所谓的逻辑CPU,此时物理CPU的一颗内核在一个时间片内理论上可同时执行两个内核线程,从而提高了整个CPU的工作效率,此时逻辑CPU的数量=物理CPU的数量×单个CPU的内核数×2(本机即 1×6×2=12)。值得注意的是,一颗内核并不代表只能有一个或者两个逻辑CPU,也可以有4个逻辑CPU或者更多。
(2)OpenMP的基本语法
基本语法为
#pragma omp parallel [for|sections]
{
.....//并行次数由PC的内核数决定
}
大括号里面的语句会被线程组中的每个线程各执行一次;默认情况下线程数通常等于本机的逻辑处理器数
#include <iostream>
#include <omp.h>
#include<chrono>
using namespace std;
using namespace chrono;
/// Prints the greeting `num` times from the calling thread, one line per
/// iteration; each line also shows the id returned by omp_get_thread_num().
/// Nothing is printed when `num` is zero or negative.
void sequentialProgram(int num)
{
    int remaining = num;
    while (remaining > 0)
    {
        const int threadId = omp_get_thread_num();
        printf("%s the current thread id: %d\n", "hello world", threadId);
        --remaining;
    }
}
/// Opens a single OpenMP parallel region in which every thread of the team
/// prints the greeting once, together with its own thread id.
/// The `num` argument is not read by this function.
void parallelProgram(int num)
{
    static_cast<void>(num); // the region below never consults the argument

#pragma omp parallel
    {
        // Declared inside the region, so each thread has its own copy.
        const int threadId = omp_get_thread_num();
        printf("%s the current thread id: %d\n", "hello world", threadId);
    }
}
/// Times sequentialProgram() and parallelProgram() one after the other with
/// std::chrono::steady_clock and prints each elapsed time in seconds.
int main() {
    // Both variants receive the processor count reported by the OpenMP runtime.
    int num = omp_get_num_procs();

    auto start_time = std::chrono::steady_clock::now();
    sequentialProgram(num);
    auto end_time = std::chrono::steady_clock::now();
    // The label names the function that was actually timed in this interval.
    std::cout << "sequentialProgram elapse time: "
              << std::chrono::duration<double>(end_time - start_time).count()
              << " seconds" << std::endl;

    start_time = std::chrono::steady_clock::now();
    parallelProgram(num);
    end_time = std::chrono::steady_clock::now();
    std::cout << "parallelProgram elapse time: "
              << std::chrono::duration<double>(end_time - start_time).count()
              << " seconds" << std::endl;

    return 0;
}
由结果可见,串行程序都在主线程id=0上执行,而OpenMP并行区域分别在0~11号线程上执行
F:\OpenMP\cmake-build-debug\OpenMP.exe
hello world the current thread id: 0
hello world the current thread id: 0
hello world the current thread id: 0
hello world the current thread id: 0
hello world the current thread id: 0
hello world the current thread id: 0
hello world the current thread id: 0
hello world the current thread id: 0
hello world the current thread id: 0
hello world the current thread id: 0
hello world the current thread id: 0
hello world the current thread id: 0
sequentialProgram elapse time: 0.024749 seconds
hello world the current thread id: 1
hello world the current thread id: 0
hello world the current thread id: 4
hello world the current thread id: 11
hello world the current thread id: 7
hello world the current thread id: 3
hello world the current thread id: 8
hello world the current thread id: 6
hello world the current thread id: 9
hello world the current thread id: 10
hello world the current thread id: 2
hello world the current thread id: 5
parallelProgram elapse time: 0.0278966 seconds
Process finished with exit code 0
(2)还可以指定线程数量,来运行代码
#pragma omp parallel num_threads(n)
{
....
}
或者
omp_set_num_threads(n);//设置后续并行区域默认使用的线程数(运行时调用,并非设置环境变量;对应的环境变量是OMP_NUM_THREADS)
#pragma omp parallel
{
....
}
测试代码
#include <iostream>
#include <omp.h>
#include<chrono>
using namespace std;
using namespace chrono;
/// Serial reference: emits the greeting line `num` times on the calling
/// thread, tagging every line with the value of omp_get_thread_num().
/// A zero or negative `num` produces no output.
void sequentialProgram(int num)
{
    int printed = 0;
    while (printed < num)
    {
        const int threadId = omp_get_thread_num();
        printf("%s the current thread id: %d\n", "hello world", threadId);
        ++printed;
    }
}
/// Runs three consecutive parallel regions tagged "A", "B" and "C",
/// requesting 6, 3 and 2 threads respectively. Inside each region every
/// thread prints the tag and its own thread id once.
/// The `num` argument is not read by this function.
void parallelProgram(int num)
{
    static_cast<void>(num); // none of the regions below uses the argument

    // Region A: thread count requested through the runtime call.
    omp_set_num_threads(6);
#pragma omp parallel
    {
        const int threadId = omp_get_thread_num();
        printf("%s the current thread id: %d\n", "A hello world", threadId);
    }

    // Region B: a second runtime call replaces the earlier request.
    omp_set_num_threads(3);
#pragma omp parallel
    {
        const int threadId = omp_get_thread_num();
        printf("%s the current thread id: %d\n", "B hello world", threadId);
    }

    // Region C: thread count requested with the num_threads clause on the directive.
#pragma omp parallel num_threads(2)
    {
        const int threadId = omp_get_thread_num();
        printf("%s the current thread id: %d\n", "C hello world", threadId);
    }
}
/// Times sequentialProgram() and then parallelProgram() with
/// std::chrono::steady_clock and reports both durations in seconds.
int main() {
    // Both variants receive the processor count reported by the OpenMP runtime.
    const int num = omp_get_num_procs();

    const auto serialBegin = std::chrono::steady_clock::now();
    sequentialProgram(num);
    const auto serialEnd = std::chrono::steady_clock::now();
    const std::chrono::duration<double> serialSeconds = serialEnd - serialBegin;
    std::cout << "threadProgram elapse time: " << serialSeconds.count() << " seconds" << std::endl;

    const auto parallelBegin = std::chrono::steady_clock::now();
    parallelProgram(num);
    const auto parallelEnd = std::chrono::steady_clock::now();
    const std::chrono::duration<double> parallelSeconds = parallelEnd - parallelBegin;
    std::cout << "parallelProgram elapse time: " << parallelSeconds.count() << " seconds" << std::endl;

    return 0;
}
运行结果,可以分析其线程号
F:\OpenMP\cmake-build-debug\OpenMP.exe
hello world the current thread id: 0
hello world the current thread id: 0
hello world the current thread id: 0
hello world the current thread id: 0
hello world the current thread id: 0
hello world the current thread id: 0
hello world the current thread id: 0
hello world the current thread id: 0
hello world the current thread id: 0
hello world the current thread id: 0
hello world the current thread id: 0
hello world the current thread id: 0
threadProgram elapse time: 0.0622209 seconds
A hello world the current thread id: 1
A hello world the current thread id: 5
A hello world the current thread id: 0
A hello world the current thread id: 2
A hello world the current thread id: 4
A hello world the current thread id: 3
B hello world the current thread id: 1
B hello world the current thread id: 2
B hello world the current thread id: 0
C hello world the current thread id: 1
C hello world the current thread id: 0
parallelProgram elapse time: 0.0310061 seconds
Process finished with exit code 0
(3)随着循环次数num的增大,耗时差距会更明显,OpenMP并行for的效率明显更高
#pragma omp parallel for
for(int i=0;i<num;i++)
{
.......
};
测试代码
#include <iostream>
#include <omp.h>
#include<chrono>
using namespace std;
using namespace chrono;
/// Serial baseline for the timing comparison: advances a counter from 0 up
/// to `num` on the calling thread. The loop body is empty because the
/// print statements are disabled.
void sequentialProgram(int num)
{
    int step = 0;
    while (step < num)
    {
        // No per-iteration work; only the loop itself is being timed.
        ++step;
    }
}
/// OpenMP counterpart of the serial baseline: the iterations 0 .. num-1 are
/// shared out among the team's threads by `#pragma omp parallel for`.
/// The loop body is empty because the print statements are disabled.
void parallelProgram(int num)
{
#pragma omp parallel for
    for (int iter = 0; iter < num; ++iter)
    {
        // No per-iteration work; only the parallel loop itself is being timed.
    }
}
/// Benchmarks the empty serial loop against the `parallel for` version and
/// prints both wall-clock durations in seconds.
int main() {
    // Ten million iterations per processor reported by the OpenMP runtime.
    const int num = omp_get_num_procs() * 10000000;

    const auto serialBegin = std::chrono::steady_clock::now();
    sequentialProgram(num);
    const auto serialEnd = std::chrono::steady_clock::now();
    const std::chrono::duration<double> serialSeconds = serialEnd - serialBegin;
    std::cout << "sequentialProgram elapse time: " << serialSeconds.count() << " seconds" << std::endl;

    const auto parallelBegin = std::chrono::steady_clock::now();
    parallelProgram(num);
    const auto parallelEnd = std::chrono::steady_clock::now();
    const std::chrono::duration<double> parallelSeconds = parallelEnd - parallelBegin;
    std::cout << "parallelProgram elapse time: " << parallelSeconds.count() << " seconds" << std::endl;

    return 0;
}
测试结果,时间对比消耗
F:\OpenMP\cmake-build-debug\OpenMP.exe
sequentialProgram elapse time: 0.268991 seconds
parallelProgram elapse time: 0.0172777 seconds
Process finished with exit code 0
(4)多个并行for指令处理
#pragma omp parallel
{
#pragma omp for
for(int i=0;i<num/2;i++)//num此为偶数
{
.....
}
#pragma omp for
for(int i=num/2;i<num;i++)
{
.......
}
}
测试代码
#include <iostream>
#include <omp.h>
#include<chrono>
using namespace std;
using namespace chrono;
/// Serial baseline: counts from 0 up to `num` on the calling thread.
/// The loop body is empty because the print statements are disabled.
void sequentialProgram(int num)
{
    int visited = 0;
    while (visited < num)
    {
        // Nothing happens per iteration; only the loop overhead is measured.
        ++visited;
    }
}
/// Splits the index range [0, num) into two halves and runs each half as
/// its own `#pragma omp for` worksharing loop inside one shared parallel
/// region. Both loop bodies are empty because the print statements are
/// disabled.
void parallelProgram(int num)
{
    // Split point, computed once before the region and only read inside it.
    const int half = num / 2;

#pragma omp parallel
    {
        // First worksharing loop: indices [0, half).
#pragma omp for
        for (int iter = 0; iter < half; ++iter)
        {
            // No per-iteration work.
        }

        // Second worksharing loop: indices [half, num).
#pragma omp for
        for (int iter = half; iter < num; ++iter)
        {
            // No per-iteration work.
        }
    }
}
/// Benchmarks the empty serial loop against the two-loop OpenMP version and
/// prints both wall-clock durations in seconds.
int main() {
    // Ten million iterations per processor reported by the OpenMP runtime.
    const int num = omp_get_num_procs() * 10000000;

    const auto serialBegin = std::chrono::steady_clock::now();
    sequentialProgram(num);
    const auto serialEnd = std::chrono::steady_clock::now();
    const std::chrono::duration<double> serialSeconds = serialEnd - serialBegin;
    std::cout << "sequentialProgram elapse time: " << serialSeconds.count() << " seconds" << std::endl;

    const auto parallelBegin = std::chrono::steady_clock::now();
    parallelProgram(num);
    const auto parallelEnd = std::chrono::steady_clock::now();
    const std::chrono::duration<double> parallelSeconds = parallelEnd - parallelBegin;
    std::cout << "parallelProgram elapse time: " << parallelSeconds.count() << " seconds" << std::endl;

    return 0;
}
测试结果
F:\OpenMP\cmake-build-debug\OpenMP.exe
sequentialProgram elapse time: 0.263229 seconds
parallelProgram elapse time: 0.0286121 seconds
Process finished with exit code 0
参考:《OpenMP编译原理及实现技术》 清华大学出版社
《OpenMP 简易教程》作者:周伟明 整理:Vae Anchoret
https://docs.microsoft.com/zh-cn/cpp/parallel/openmp/reference/openmp-functions?view=msvc-160
标签:std,函数,thread,parallel,current,num,world,OpenMP,id From: https://blog.51cto.com/u_12504263/5719060