The Topo module provides the interfaces for building the System topology and the Graph topology; it depends on the XML module.
Links
Link bandwidth
NCCL models the different kinds of links, but the absolute values do not look accurate; the author suspects they were obfuscated when the code was open-sourced. As long as the relative values do not change the result of the topology search, they have no impact on the execution logic, so there is no need to take these bandwidth numbers too literally.
#define LOC_BW 5000.0
#define SM60_NVLINK_BW 18.0
#define SM70_NVLINK_BW 20.0
#define SM80_NVLINK_BW 20.0
#define SM90_NVLINK_BW 20.6
#define SM86_NVLINK_BW 12.0
#define PCI_BW 12.0 // PCI Gen3 x16
#define QPI_BW 6.0
#define AMD_BW 16.0
#define SKL_QPI_BW 10.0
#define ZPI_BW 6.0
#define YONGFENG_ZPI_BW 9.0
#define P9_BW 32.0
#define ARM_BW 6.0
#define NET_BW 12.0 // 100Gbit
// Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU
// to GPU traffic consumes more PCI bandwidth.
#define INTEL_P2P_OVERHEAD(bw) (bw*6/5)
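As a worked example of how these constants are meant to be read (a minimal sketch, not NCCL code; it simply evaluates the macros above):

#include <stdio.h>

#define PCI_BW 12.0 // PCI Gen3 x16
#define INTEL_P2P_OVERHEAD(bw) (bw*6/5)

int main(void) {
  // On Intel CPUs, GPU-to-GPU P2P traffic is split into 64B TLPs, so moving
  // data at the nominal 12 GB/s costs 12 * 6/5 = 14.4 GB/s of PCI bandwidth.
  printf("PCI bandwidth charged for Intel P2P traffic: %.1f\n", INTEL_P2P_OVERHEAD(PCI_BW));
  return 0;
}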
Link types
There are five link types between nodes: local (loopback), NVLink, PCI, CPU (traffic crossing the CPU/SMP interconnect), and network.
// We want link types and path types to match as much as possible
#define LINK_LOC 0
#define LINK_NVL 1
// Skipping 2 for PATH_NVB
#define LINK_PCI 3
// Skipping 4 for PATH_PXB
// Skipping 5 for PATH_PXN
// Skipping 6 for PATH_PHB
#define LINK_SYS 7
#define LINK_NET 8
In reality, NVLink has two sub-cases (a direct NVLink connection, or NVLink routed through an intermediate GPU), and PCI has four sub-cases (through at most one PCIe bridge without crossing the CPU root complex; through multiple PCIe bridges but still without crossing the root complex; a GPU reaching a NIC through another GPU acting as a relay; and crossing the CPU root complex). Two nodes may also be disconnected, so refining the five link types this way yields ten possible path types between nodes.
// Local (myself)
#define PATH_LOC 0
// Connection traversing NVLink
#define PATH_NVL 1
// Connection through NVLink using an intermediate GPU
#define PATH_NVB 2
// Connection traversing at most a single PCIe bridge
#define PATH_PIX 3
// Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
#define PATH_PXB 4
// Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations.
#define PATH_PXN 5
// Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
#define PATH_PHB 6
// Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
#define PATH_SYS 7
// Connection through the network
#define PATH_NET 8
// Disconnected
#define PATH_DIS 9
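Because the PATH_* values are ordered roughly by distance (local < NVLink < PCIe < host bridge < SMP interconnect < network), consumers of a computed path can make decisions with a plain numeric comparison. The helper below is an illustrative sketch, not NCCL's API; the PATH_PXB threshold mirrors the usual "stay below the PCIe host bridge" heuristic for GPUDirect RDMA:

// Illustrative only: decide whether a GPU<->NIC path is close enough for
// GPUDirect RDMA by comparing its PATH_* value against a threshold.
static int pathAllowsGdr(int pathType) {
  // PATH_PIX / PATH_PXB stay below the PCIe host bridge; PATH_PHB and beyond
  // cross the CPU and are usually considered too far.
  return pathType <= PATH_PXB;
}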
Nodes
NCCL divides topology nodes into six types: GPU, PCI, NVS, CPU (which actually stands for a NUMA node), NIC and NET (note that NCCL_TOPO_NODE_TYPES is defined as 7, one more than the number of type values actually used). These types are not isolated from one another (for example, a NIC node may hang under a PCI node, since a high-performance NIC is usually a PCIe device), but every node that appears in the System topology graph is guaranteed to be one of these types.
#define NCCL_TOPO_NODE_TYPES 7
#define GPU 0
#define PCI 1
#define NVS 2
#define CPU 3 // Actually NUMA domains
#define NIC 4
#define NET 5
Structs
A single link to a remote node is represented by ncclTopoLink, a path made up of several links by ncclTopoLinkList, a node by ncclTopoNode, the set of all nodes by ncclTopoNodeSet, and the whole interconnect system by ncclTopoSystem. In addition, ncclTopoGraph represents a logical topology produced by the search.
ncclTopoLink
ncclTopoLink describes how a node connects to a remote node and with what bandwidth. See the annotations for each member.
struct ncclTopoLink {
int type; // 5 types, LINK_LOC, LINK_NVL, LINK_PCI, LINK_SYS, LINK_NET
float bw; // pre-defined bandwidth for different type of link
struct ncclTopoNode* remNode; // remote node
};
ncclTopoLinkList
ncclTopoLinkList describes a path to another node: the links it traverses, how many hops it has, the bandwidth available along the path, and the path type. See the annotations for each member.
#define NCCL_TOPO_MAX_LINKS 128 // Max number of links per node
#define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES) // Max number of hops
struct ncclTopoLinkList {
struct ncclTopoLink* list[NCCL_TOPO_MAX_HOPS]; // links traversed along the path, in order
int count; // number of hops in the path
float bw; // bandwidth available along the path
int type; // path type (one of the PATH_* values)
};
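To make the relationship between the members concrete, here is a hedged sketch (it assumes NCCL's internal topology definitions are in scope and that paths have already been computed by the Paths module) that prints every hop of a precomputed path:

#include <stdio.h>

// Sketch: given a precomputed path, print each hop it traverses.
void printPath(struct ncclTopoLinkList* path) {
  printf("hops=%d pathType=%d bw=%.1f\n", path->count, path->type, path->bw);
  for (int h = 0; h < path->count; h++) {
    struct ncclTopoLink* link = path->list[h];
    printf("  hop %d -> remote node type %d (link type %d, bw %.1f)\n",
           h, link->remNode->type, link->type, link->bw);
  }
}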
ncclTopoNode
ncclTopoNode describes a node in the topology together with all of its links. See the annotations for each member.
struct ncclTopoNode {
int type; // node type: GPU, PCI, NVS, CPU, NIC or NET
int64_t id; // node id
// Type specific data
union {
struct {
int dev; // NVML dev number
int rank;
int cudaCompCap;
int gdrSupport;
}gpu;
struct {
int dev; // Plugin dev number
uint64_t asic;
int port;
float bw;
float latency;
int gdrSupport;
int collSupport;
int maxChannels;
}net;
struct {
int arch;
int vendor;
int model;
cpu_set_t affinity;
}cpu;
struct {
uint64_t device;
}pci;
};
int nlinks; // Links amount
struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS]; // Links to other nodes
struct ncclTopoLinkList* paths[NCCL_TOPO_NODE_TYPES]; // Pre-computed paths to GPUs and NICs
uint64_t used; // Used during search
};
ncclTopoNodeSet and ncclTopoSystem
ncclTopoNodeSet describes all nodes of one type in the topology. ncclTopoSystem contains one ncclTopoNodeSet per node type, the hashes of all hosts (hostHashes) together with systemId, the index of the local host among them, and the local host's maxBw and totalBw. See the annotations for each member.
struct ncclTopoNodeSet {
int count; // number of nodes
struct ncclTopoNode nodes[NCCL_TOPO_MAX_NODES]; // all nodes
};
struct ncclTopoSystem {
int systemId; // index of the local host in hostHashes
uint64_t hostHashes[NCCL_TOPO_MAX_NODES]; // host hash for each host
int nHosts; // number of hosts
struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES]; // all nodes, one ncclTopoNodeSet per node type
float maxBw; // max link bandwidth of localhost
float totalBw; // total bandwidth of localhost
};
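The two structures are easiest to understand by walking them. A hedged sketch (again assuming the internal definitions above are in scope) that dumps every node and its direct links:

#include <stdio.h>

// Sketch: dump every node and its direct links, mirroring how the search code
// walks the system.
void dumpSystem(struct ncclTopoSystem* system) {
  for (int t = 0; t < NCCL_TOPO_NODE_TYPES; t++) {
    for (int n = 0; n < system->nodes[t].count; n++) {
      struct ncclTopoNode* node = system->nodes[t].nodes + n;
      printf("node type=%d id=%llx nlinks=%d\n",
             node->type, (unsigned long long)node->id, node->nlinks);
      for (int l = 0; l < node->nlinks; l++) {
        struct ncclTopoLink* link = node->links + l;
        printf("  -> type=%d id=%llx via link type=%d bw=%.1f\n",
               link->remNode->type, (unsigned long long)link->remNode->id,
               link->type, link->bw);
      }
    }
  }
}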
Interfaces
The following are the key interfaces of the Topo module as selected by the author (note that some of them are not declared in topo.h). The most important is ncclTopoGetSystem, the entry point for building an ncclTopoSystem; upper layers call ncclTopoGetSystem to construct the system topology. The remaining functions mainly provide building blocks for ncclTopoGetSystem, such as creating, removing, looking up and connecting nodes.
// create system topo
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system);
// basic functions for system topo maintenance
ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id);
ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id);
ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int id);
ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float bw);
ncclTopoCreateNode
ncclTopoCreateNode creates an ncclTopoNode of the type and id supplied by the caller. The node lives in the static storage inside ncclTopoSystem (so no dynamic allocation such as malloc is needed), and most of its attributes are left undefined; for a GPU node, a loopback link to itself is initialized. The address of the new node is returned through the output parameter. See the code annotations for details.
ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) {
WARN("Error : tried to create too many nodes of type %d", type);
return ncclInternalError;
}
struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count;
system->nodes[type].count++;
n->type = type;
n->id = id;
if (type == GPU) {
// Create link to itself (used in some corner cases)
n->nlinks=1;
n->links[0].type = LINK_LOC;
n->links[0].remNode = n;
n->links[0].bw = LOC_BW;
n->gpu.dev = NCCL_TOPO_UNDEF;
n->gpu.rank = NCCL_TOPO_UNDEF;
n->gpu.cudaCompCap = NCCL_TOPO_UNDEF;
} else if (type == CPU) {
n->cpu.arch = NCCL_TOPO_UNDEF;
n->cpu.vendor = NCCL_TOPO_UNDEF;
n->cpu.model = NCCL_TOPO_UNDEF;
} else if (type == NET) {
n->net.asic = 0ULL;
n->net.port = NCCL_TOPO_UNDEF;
n->net.bw = 0.0;
n->net.latency = 0.0;
}
*node = n;
return ncclSuccess;
}
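A hedged usage sketch follows: the id, device, rank and compute-capability values are made up for illustration; real callers derive the GPU id from its PCI bus id while parsing the XML.

// Usage sketch (id/rank/compute-capability values are hypothetical): create a
// GPU node and fill in the fields that ncclTopoCreateNode leaves undefined.
static ncclResult_t addOneGpu(struct ncclTopoSystem* system, int rank) {
  struct ncclTopoNode* gpu;
  NCCLCHECK(ncclTopoCreateNode(system, &gpu, GPU, 0x1000 /* hypothetical id */));
  gpu->gpu.dev = 0;          // NVML device number
  gpu->gpu.rank = rank;      // communicator rank owning this GPU
  gpu->gpu.cudaCompCap = 90; // e.g. a Hopper GPU
  return ncclSuccess;
}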
ncclTopoRemoveNode
ncclTopoRemoveNode deletes the node of type type at position index. Besides removing it from system->nodes, the link information of the remaining nodes has to be updated. See the code annotations for details.
ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int index) {
struct ncclTopoNode* delNode = system->nodes[type].nodes+index;
// traverse every node's links and remove any link pointing to delNode
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
free(delNode->paths[t]);
for (int n=0; n<system->nodes[t].count; n++) {
struct ncclTopoNode* node = system->nodes[t].nodes+n;
if (node == delNode) continue;
// drop links to delNode (later links shift forward); also adjust remNode pointers of nodes stored after delNode, since they move down by one slot
for (int l=0; l<node->nlinks; l++) {
while (l<node->nlinks && node->links[l].remNode == delNode) {
memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink));
node->nlinks--;
}
if (l<node->nlinks && node->links[l].remNode->type == type && node->links[l].remNode >= delNode) {
node->links[l].remNode--;
}
}
}
}
// delete delNode from system->nodes
memmove(delNode, delNode+1, (system->nodes[type].count-index-1)*sizeof(struct ncclTopoNode));
system->nodes[type].count--;
return ncclSuccess;
}
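Because the removal shifts every node stored after index down by one slot, batch removals are simplest when iterating from the highest index to the lowest. An illustrative sketch:

// Sketch: remove all NET nodes (e.g. when every rank is on the same host and
// the network is not needed). Iterating backwards avoids re-indexing issues
// caused by the memmove inside ncclTopoRemoveNode.
static ncclResult_t dropAllNetNodes(struct ncclTopoSystem* system) {
  for (int n = system->nodes[NET].count - 1; n >= 0; n--) {
    NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
  }
  return ncclSuccess;
}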
ncclTopoGetNode
ncclTopoGetNode looks up a node in system->nodes by type and id and returns its address. See the code annotations for details.
ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
for (int i=0; i<system->nodes[type].count; i++) {
if (system->nodes[type].nodes[i].id == id) {
*node = system->nodes[type].nodes+i;
return ncclSuccess;
}
}
return ncclSuccess;
}
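Note that the function also returns ncclSuccess when nothing matches; the output pointer is simply left untouched. Callers therefore initialize it to NULL and test it afterwards, which gives the usual get-or-create pattern. A minimal sketch (numaId is a hypothetical parameter):

// Usage sketch: get-or-create a CPU (NUMA) node; *cpu stays NULL on a miss.
static ncclResult_t getOrCreateCpu(struct ncclTopoSystem* system, uint64_t numaId,
                                   struct ncclTopoNode** cpu) {
  *cpu = NULL;
  NCCLCHECK(ncclTopoGetNode(system, cpu, CPU, numaId));
  if (*cpu == NULL) NCCLCHECK(ncclTopoCreateNode(system, cpu, CPU, numaId));
  return ncclSuccess;
}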
ncclTopoConnectNodes
ncclTopoConnectNodes adds a link from node to remNode with link type type and bandwidth bw. Note that if a link of the same type to the same remote node already exists, the bandwidth is accumulated (judging from the code comment this aggregation is aimed at NVLink; physically it corresponds to adding another NVLink lane between two GPUs). Finally, the function re-sorts all of node's links so that higher-bandwidth links come first, which makes the later logical-topology search more convenient. See the code annotations for details.
// connect node to remNode with a given type and bandwidths
ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float bw) {
// Aggregate links into higher bw for NVLink
struct ncclTopoLink* link;
for (link = node->links; link - node->links != NCCL_TOPO_MAX_LINKS && link->remNode; link++) {
if (link->remNode == remNode && link->type == type) break;
}
if (link - node->links == NCCL_TOPO_MAX_LINKS) {
WARN("Error : too many Topo links (max %d)", NCCL_TOPO_MAX_LINKS);
return ncclInternalError;
}
// remNode == NULL means no link to remNode, so nlinks++
if (link->remNode == NULL) node->nlinks++;
link->type = type;
link->remNode = remNode;
link->bw += bw;
// Sort links in BW descending order
struct ncclTopoLink linkSave;
memcpy(&linkSave, link, sizeof(struct ncclTopoLink));
while (link != node->links) {
if ((link-1)->bw >= linkSave.bw) break;
memcpy(link, link-1, sizeof(struct ncclTopoLink));
link--;
}
memcpy(link, &linkSave, sizeof(struct ncclTopoLink));
return ncclSuccess;
}
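Also note that ncclTopoConnectNodes only modifies node's link array, so a bidirectional connection takes one call per direction. A hedged sketch for two GPUs connected by NVLink (nvlinkBw stands in for the per-generation constant):

// Sketch: connect gpu0 and gpu1 over NVLink in both directions. Calling the
// function again for the same pair and type accumulates bandwidth (extra lanes).
static ncclResult_t connectGpuPair(struct ncclTopoNode* gpu0, struct ncclTopoNode* gpu1,
                                   float nvlinkBw) {
  NCCLCHECK(ncclTopoConnectNodes(gpu0, gpu1, LINK_NVL, nvlinkBw));
  NCCLCHECK(ncclTopoConnectNodes(gpu1, gpu0, LINK_NVL, nvlinkBw));
  return ncclSuccess;
}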
ncclTopoGetSystem
ncclTopoGetSystem is the most important function of the Topo module. NCCL calls it during initialization to obtain the system topology, which the later logical-topology search builds on. The local system relies on the XML module to discover GPUs and NICs; when multi-node NVLink ("south-bound" GPU interconnect, MNNVL) is supported, the local XML is serialized and exchanged over the bootstrap network so that every peer in the clique obtains all peers' XML via AllGather, the per-peer XML is then fused, and finally ncclTopoGetSystemFromXml builds the ncclTopoSystem.
ncclTopoGetSystemFromXml first calls ncclTopoAddCpu for each NUMA node to recursively discover the PCI devices and NICs under it; at that point all nodes exist, but the link information between them is still incomplete. The following ncclTopoAddNvLinks, ncclTopoAddC2c and ncclTopoConnectCpus calls add the missing links.
Finally, once the system topology is complete, ncclTopoSortSystem sorts it so that the UPLINK of each PCI tree sits at the end of its link array, which simplifies the later logical-topology computation.
[Figure: the main call chain of ncclTopoGetSystem]
See the code annotations for details.
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
struct ncclXml* xml;
NCCLCHECK(xmlAlloc(&xml, NCCL_TOPO_XML_MAX_NODES));
// able to read pre-defined topo info from xml file
const char* xmlTopoFile = ncclGetEnv("NCCL_TOPO_FILE");
if (xmlTopoFile) {
INFO(NCCL_ENV, "NCCL_TOPO_FILE set by environment to %s", xmlTopoFile);
NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml, 1));
} else {
// Try default XML topology location
NCCLCHECK(ncclTopoGetXmlFromFile("/var/run/nvidia-topologyd/virtualTopology.xml", xml, 0));
}
if (xml->maxIndex == 0) {
// Create top tag
struct ncclXmlNode* top;
NCCLCHECK(xmlAddNode(xml, NULL, "system", &top));
NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION));
}
// Auto-detect GPUs if needed
for (int r=0; r<comm->nRanks; r++) {
// on same host
if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
struct ncclXmlNode* node;
NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
if (node == NULL) continue;
NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
NCCLCHECK(xmlSetAttrInt(node, "rank", r));
NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
}
}
// Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
// so we start with collnet so that it has precedence.
int netDevCount = 0;
if (collNetSupport(comm)) {
NCCLCHECK(collNetDevices(comm, &netDevCount));
for (int n=0; n<netDevCount; n++) {
ncclNetProperties_t props;
NCCLCHECK(collNetGetProperties(comm, n, &props));
struct ncclXmlNode* netNode;
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
NCCLCHECK(xmlInitAttrInt(netNode, "coll", 1));
}
}
if (netDevCount == 0) {
NCCLCHECK(comm->ncclNet->devices(&netDevCount));
}
for (int n=0; n<netDevCount; n++) {
ncclNetProperties_t props;
NCCLCHECK(comm->ncclNet->getProperties(n, &props));
struct ncclXmlNode* netNode;
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
NCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency));
NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
}
// Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
NCCLCHECK(ncclTopoTrimXml(xml));
// merge multiple hosts into one system when multi-node NVLink (MNNVL) is supported by the hardware
if (comm->MNNVL) {
// MNNVL clique support
char* mem;
NCCLCHECK(ncclCalloc(&mem, comm->clique.size * xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
struct ncclXml* rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*comm->cliqueRank);
memcpy(rankXml, xml, xmlMemSize(NCCL_TOPO_XML_MAX_NODES));
NCCLCHECK(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1));
NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->clique.ranks, comm->cliqueRank, comm->clique.size, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
struct ncclXml* cliqueXml;
NCCLCHECK(xmlAlloc(&cliqueXml, comm->clique.size*NCCL_TOPO_XML_MAX_NODES));
for (int i = 0; i < comm->clique.size; i++) {
struct ncclXml* peerXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*i);
NCCLCHECK(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0));
NCCLCHECK(ncclTopoFuseXml(cliqueXml, peerXml));
}
free(xml);
xml = cliqueXml;
}
xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml));
}
// generate ncclTopoSystem using ncclXml
NCCLCHECK(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash));
free(xml);
return ncclSuccess;
}
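As the code above shows, detection can also be bypassed or inspected through environment variables: NCCL_TOPO_FILE injects a pre-built topology XML (with /var/run/nvidia-topologyd/virtualTopology.xml as the default fallback), while NCCL_TOPO_DUMP_FILE writes the detected topology back out on the rank selected by ncclParamTopoDumpFileRank(), which is convenient when debugging topology issues on a new machine.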
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem, const uint64_t localHostHash) {
NCCLCHECK(ncclCalloc(topoSystem, 1));
struct ncclTopoSystem* system = *topoSystem;
struct ncclXmlNode* topNode;
NCCLCHECK(xmlFindTag(xml, "system", &topNode));
// Create system topo for each numa node
for (int s=0; s<topNode->nSubs; s++) {
struct ncclXmlNode* node = topNode->subs[s];
if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem));
}
for (int systemId=0; systemId<system->nHosts; systemId++) if (system->hostHashes[systemId] == localHostHash) system->systemId = systemId;
// Add NVLinks for GPU-GPU or CPU-GPU connections
NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL, 0));
// Add C2C links for GPU-CPU connections
NCCLCHECK(ncclTopoAddC2c(topNode, *topoSystem, NULL, 0));
// Flatten BCM switches
NCCLCHECK(ncclTopoFlattenBcmSwitches(*topoSystem));
// Connect all CPU nodes together
NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
// Sort the system
NCCLCHECK(ncclTopoSortSystem(*topoSystem));
return ncclSuccess;
}