算子工程中sim和cpu模式都运行在cpu上,两者有什么区别?
猜测sim的等效是在npu指令层面,cpu的等效仅仅在AsecendC的层面?
表现在sim可以完全等效得跑cce代码算子,而cpu只能跑AscendC的算子。
当cce代码保存为.cpp后缀的文件,采用ccec编译,需要指定-x cce
,即指定编译语言。
ccec -x cce --cce-aicore-arch=dav-c220-vec -c add.cce
编译出来的结果为x86格式的elf文件,但npu的指令放在了elf中的一个段。至于是怎么包含的?ccec
启动bisheng, 添加参数"-fcce-include-aibinary", "/tmp/add_custom-e50633/add_custom-dav-c220-vec.o"
将事先编译的npu object文件包含进来。
atc转换过程中,tbe调用build构建出的算子为纯粹npu格式的elf文件。
一个简单问题:bisheng是怎么区分编译npu的object还是x86+npu代码段的?
看到bisheng的一些关键参数:x86的嵌入+npu代码段时,--triple x86_64-unknown-linux-gnu -fcce-fatobj-compile
, 编译npu算子的命令参数为--triple hiipu64-hisilicon-cce -fcce-is-aicore
。
一个调用ccec生成纯粹npu object编译命令如下,
ccec -cc1 -triple hiipu64-hisilicon-cce -fcce-is-aicore "-resource-dir" "/home/zwl/Ascend/ascend-toolkit/8.0.RC2.alpha002/x86_64-linux/ccec_compiler/lib/clang/15.0.5" -include __clang_cce_runtime_wrapper.h -o add2.o -x cce add.cce "-internal-isystem" "/usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11" "-internal-isystem" "/usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/x86_64-linux-gnu/c++/11" "-internal-isystem" "/usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/backward" "-internal-isystem" "/home/zwl/Ascend/ascend-toolkit/8.0.RC2.alpha002/x86_64-linux/ccec_compiler/lib/clang/15.0.5/include" "-internal-isystem" "/usr/local/include" "-internal-isystem" "/usr/lib/gcc/x86_64-linux-gnu/11/../../../../x86_64-linux-gnu/include" "-internal-externc-isystem" "/usr/include/x86_64-linux-gnu" "-internal-externc-isystem" "/include" "-internal-externc-isystem" "/usr/include" -emit-obj -target-cpu dav-c220-vec
命令中几个参数可以关注一下:
- 包含的头文件
__clang_cce_runtime_wrapper.h
,这让你可以使用__aicore
等宏,各种vector和cube的指令。 -cc1
表示像clang前端传递参数并只运行ast到llvm-IR的步骤,最后加的-emit-obj
表示生成object文件,默认只会到ir自然退出。
上述命令只会编译cpp中加了__aicore__
的函数。aicore上的函数要求返回为void。假设cce中定义了一个函数fun
, 编译结果elf中会有一个fun
的函数和一个fun__
的对象,后者fun__
表示的是kernelArgSize,即函数如参数Byte数。
'+all' is not a recognized feature for this target (ignoring feature)
_Z8myadd_doPhS_S_S_:
000000f0: ff c3 00 d1 sub sp, sp, #0x30
000000f4: fd 7b 02 a9 stp x29, x30, [sp, #0x20]
000000f8: fd 83 00 91 add x29, sp, #0x20
000000fc: a0 83 1f f8 stur x0, [x29, #-0x8]
00000100: e1 0b 00 f9 str x1, [sp, #0x10]
00000104: e2 07 00 f9 str x2, [sp, #0x8]
00000108: e3 03 00 f9 str x3, [sp]
0000010c: e2 03 40 f9 ldr x2, [sp]
00000110: 00 01 80 52 mov w0, #0x8
00000114: e1 03 1f aa mov x1, xzr
00000118: 00 00 00 94 bl #0x0 // NotExistType : _Z21__cce_rtConfigureCalljPvS_ + 0
0000011c: 08 00 00 71 subs w8, w0, #0x0
00000120: e8 07 9f 1a cset w8, ne
00000124: e8 00 00 37 tbnz w8, #0x0, #0x1c
00000128: 01 00 00 14 b #0x4
0000012c: a0 83 5f f8 ldur x0, [x29, #-0x8]
00000130: e1 0b 40 f9 ldr x1, [sp, #0x10]
00000134: e2 07 40 f9 ldr x2, [sp, #0x8]
00000138: 00 00 00 94 bl #0x0 // NotExistType : myadd + 0
0000013c: 01 00 00 14 b #0x4
00000140: fd 7b 42 a9 ldp x29, x30, [sp, #0x20]
00000144: ff c3 00 91 add sp, sp, #0x30
00000148: c0 03 5f d6 ret
打开cce代码看,
extern "C" __global__ __aicore__ void myadd(GM_ADDR x, GM_ADDR y, GM_ADDR z)
{
...
}
#ifndef __CCE_KT_TEST__
void myadd_do(uint8_t* x, uint8_t* y, uint8_t* z, uint8_t* stream) {
myadd<<<8, nullptr, stream>>>((half*)x, (half*)y, (float*)z);
}
#endif
通过ccec -x cce -c add.cce -o add.o --cce-aicore-arch=dav-c220-vec
编译出add.o的aarch64 object文件,并将npu的指令内嵌到某个代码段,
一些在aarch64 CPU架构上的函数的反汇编如下:
// myadd_do函数,内部使用<<<>>>方式调用算子,三箭头的调用步骤是,将blockDim, l2ctrl, stream传给_Z21__cce_rtConfigureCalljPvS_函数(即__cce_rtConfigureCall(unsigned int, void*, void*))完成算子rt的配置,若配置结果不为零,直接跳出。否则将x,y,z三个参数传递给myadd的函数
_Z8myadd_doPhS_S_S_:
000000f0: ff c3 00 d1 sub sp, sp, #0x30
000000f4: fd 7b 02 a9 stp x29, x30, [sp, #0x20]
000000f8: fd 83 00 91 add x29, sp, #0x20
000000fc: a0 83 1f f8 stur x0, [x29, #-0x8]
00000100: e1 0b 00 f9 str x1, [sp, #0x10]
00000104: e2 07 00 f9 str x2, [sp, #0x8]
00000108: e3 03 00 f9 str x3, [sp]
0000010c: e2 03 40 f9 ldr x2, [sp]
00000110: 00 01 80 52 mov w0, #0x8
00000114: e1 03 1f aa mov x1, xzr
00000118: 00 00 00 94 bl #0x0 // NotExistType : _Z21__cce_rtConfigureCalljPvS_ + 0
0000011c: 08 00 00 71 subs w8, w0, #0x0
00000120: e8 07 9f 1a cset w8, ne
00000124: e8 00 00 37 tbnz w8, #0x0, #0x1c
00000128: 01 00 00 14 b #0x4
0000012c: a0 83 5f f8 ldur x0, [x29, #-0x8]
00000130: e1 0b 40 f9 ldr x1, [sp, #0x10]
00000134: e2 07 40 f9 ldr x2, [sp, #0x8]
00000138: 00 00 00 94 bl #0x0 // NotExistType : myadd + 0
0000013c: 01 00 00 14 b #0x4
00000140: fd 7b 42 a9 ldp x29, x30, [sp, #0x20]
00000144: ff c3 00 91 add sp, sp, #0x30
00000148: c0 03 5f d6 ret
// 传入blockDim, l2ctrl, stream三个参数,调用rtConfigureCall
_Z21__cce_rtConfigureCalljPvS_:
00000000: ff c3 00 d1 sub sp, sp, #0x30
00000004: fd 7b 02 a9 stp x29, x30, [sp, #0x20]
00000008: fd 83 00 91 add x29, sp, #0x20
0000000c: a0 c3 1f b8 stur w0, [x29, #-0x4]
00000010: e1 0b 00 f9 str x1, [sp, #0x10]
00000014: e2 07 00 f9 str x2, [sp, #0x8]
00000018: a0 c3 5f b8 ldur w0, [x29, #-0x4]
0000001c: e1 03 1f 2a mov w1, wzr
00000020: 00 00 00 94 bl #0x0 // NotExistType : __cce_getOrSetBlockNum + 0 //这里表示设置blockNum=8
00000024: a0 c3 5f b8 ldur w0, [x29, #-0x4]
00000028: e1 0b 40 f9 ldr x1, [sp, #0x10]
0000002c: e2 07 40 f9 ldr x2, [sp, #0x8]
00000030: 00 00 00 94 bl #0x0 // NotExistType : rtConfigureCall + 0
00000034: fd 7b 42 a9 ldp x29, x30, [sp, #0x20]
00000038: ff c3 00 91 add sp, sp, #0x30
0000003c: c0 03 5f d6 ret
myadd函数应该是elf中离调用npu指令最近的地方了,单独拎出来看,
//myadd输入为GM上三个地址,x,y,z,三个参数,调用了三次rtSetupArgument(在libruntime.so或者libruntime_camodel.so中),并检验设置情况,
//然后获取设置的blockNum, 最后将myadd的函数地址、myadd算子名字,算子名长度,blockNum四个参数给到`__cce_rtLaunch`启动执行。
//这里猜测rtLaunch中递归启动了blockDim个算子。rtLaunch实现也在runtime so中
myadd:
00000000: ff c3 00 d1 sub sp, sp, #0x30
00000004: fd 7b 02 a9 stp x29, x30, [sp, #0x20]
00000008: fd 83 00 91 add x29, sp, #0x20
0000000c: e8 03 00 aa mov x8, x0
00000010: a0 23 00 d1 sub x0, x29, #0x8
00000014: a8 83 1f f8 stur x8, [x29, #-0x8]
00000018: e1 0b 00 f9 str x1, [sp, #0x10]
0000001c: e2 07 00 f9 str x2, [sp, #0x8]
00000020: 01 01 80 d2 mov x1, #0x8
00000024: e2 03 1f aa mov x2, xzr
00000028: 00 00 00 94 bl #0x0 // NotExistType : rtSetupArgument + 0
0000002c: 08 00 00 71 subs w8, w0, #0x0
00000030: e8 07 9f 1a cset w8, ne
00000034: a8 03 00 37 tbnz w8, #0x0, #0x74
00000038: 01 00 00 14 b #0x4
0000003c: e0 43 00 91 add x0, sp, #0x10
00000040: 02 01 80 d2 mov x2, #0x8
00000044: e1 03 02 aa mov x1, x2
00000048: 00 00 00 94 bl #0x0 // NotExistType : rtSetupArgument + 0
0000004c: 08 00 00 71 subs w8, w0, #0x0
00000050: e8 07 9f 1a cset w8, ne
00000054: a8 02 00 37 tbnz w8, #0x0, #0x54
00000058: 01 00 00 14 b #0x4
0000005c: e0 23 00 91 add x0, sp, #0x8
00000060: 01 01 80 d2 mov x1, #0x8
00000064: 02 02 80 d2 mov x2, #0x10
00000068: 00 00 00 94 bl #0x0 // NotExistType : rtSetupArgument + 0
0000006c: 08 00 00 71 subs w8, w0, #0x0
00000070: e8 07 9f 1a cset w8, ne
00000074: a8 01 00 37 tbnz w8, #0x0, #0x34
00000078: 01 00 00 14 b #0x4
0000007c: e0 03 1f 2a mov w0, wzr
00000080: 21 00 80 52 mov w1, #0x1
00000084: 00 00 00 94 bl #0x0 // NotExistType : __cce_getOrSetBlockNum + 0 //表示get blockNum
00000088: e3 03 00 2a mov w3, w0
0000008c: 00 00 00 90 adrp x0, #0 // NotExistType : myadd + 0
00000090: 00 00 00 91 add x0, x0, #0x0 // NotExistType : myadd + 0
00000094: 01 00 00 90 adrp x1, #0 // NotExistType : .rodata.str1.1 + 0
00000098: 21 00 00 91 add x1, x1, #0x0 // NotExistType : .rodata.str1.1 + 0
0000009c: a2 00 80 d2 mov x2, #0x5
000000a0: 00 00 00 94 bl #0x0 // NotExistType : __cce_rtLaunch + 0
000000a4: 01 00 00 14 b #0x4
000000a8: fd 7b 42 a9 ldp x29, x30, [sp, #0x20]
000000ac: ff c3 00 91 add sp, sp, #0x30
000000b0: c0 03 5f d6 ret
其中__cce_getOrSetBlockNum
如下所示,type=0表示set,否则为get,不同thread获得的值不同。
inline __attribute__((alway_inline)) int __cce_getOrSetBlockNum(int value,
int type) {
static thread_local int local = 0;
if (type == 0)
local = value;
return local;
}
在cann包的__clang_cce_link.stub
文件中找到__cce_rtLaunch
的实现,分带profiler和不带profiler的版本,不带profiler基本就是直接调rtLaunch。
#ifdef HAS_PROFILER
__attribute__ ((visibility("hidden"))) __attribute__((weak)) void __cce_rtLaunch(
void *stubFunc,
char *kernelName,
unsigned long int length,
unsigned int bolckNum)
{
unsigned long int beginTime = 0;
unsigned long int endTime = 0;
unsigned long int opName = 0;
unsigned int threadId = 0;
MsprofRegisterCallback(8, ProfCtrlHandle); // 8 - CCE defined in msprof headerfile slog.h
if (__MsprofFlagL0 || __MsprofFlagL1) {
beginTime = MsprofSysCycleTime();
}
rtLaunch(stubFunc);
if (__MsprofFlagL0 || __MsprofFlagL1) {
endTime = MsprofSysCycleTime();
opName = MsprofGetHashId(kernelName, length);
threadId = (unsigned int)(syscall(SYS_gettid));
MsprofApi info;
info.magicNumber = 0x5a5a; //MSPROF_REPORT_DATA_MAGIC_NUM
info.level = 10000; //MSPROF_REPORT_NODE_LEVEL
info.type = 5; //MSPROF_REPORT_NODE_LAUNCH_TYPE
info.threadId = threadId;
info.reserve = 0;
info.beginTime = beginTime;
info.endTime = endTime;
info.itemId = opName;
MsprofReportApi(0, &info);
}
if (__MsprofFlagL1) {
MsprofCompactInfo nodeBasicInfo;
nodeBasicInfo.magicNumber = 0x5a5a; //MSPROF_REPORT_DATA_MAGIC_NUM
nodeBasicInfo.level = 10000; //MSPROF_REPORT_NODE_LEVEL
nodeBasicInfo.type = 0; //MSPROF_REPORT_NODE_BASIC_INFO_TYPE
nodeBasicInfo.threadId = threadId;
nodeBasicInfo.timeStamp = endTime;
nodeBasicInfo.data.nodeBasicInfo.opName = opName;
nodeBasicInfo.data.nodeBasicInfo.taskType = 0; //MSPROF_GE_TASK_TYPE_AI_CORE
nodeBasicInfo.data.nodeBasicInfo.opType = opName;
nodeBasicInfo.data.nodeBasicInfo.blockDim = bolckNum;
MsprofReportCompactInfo(0, &nodeBasicInfo, sizeof(MsprofCompactInfo));
}
}
#else
__attribute__ ((visibility("hidden"))) __attribute__((weak)) void __cce_rtLaunch(
void *stubFunc,
char *kernelName,
unsigned long int length,
unsigned int bolckNum)
{
(void)kernelName;
(void)length;
(void)bolckNum;
rtLaunch(stubFunc);
}
#endif
标签:__,00,0x0,sp,学习,x29,cce,AscendC
From: https://www.cnblogs.com/zwlwf/p/18259691