The Topo module provides the interfaces for building the System topology and the Graph topology; it depends on the XML module.
Links
Link bandwidth
NCCL models the different kinds of links, but the absolute values do not look accurate; the author suspects they were obfuscated when the code was open-sourced. As long as the relative values do not change the result of the topology search, they have no impact on the execution logic, so there is no need to take these bandwidth numbers too literally.
#define LOC_BW 5000.0
#define SM60_NVLINK_BW 18.0
#define SM70_NVLINK_BW 20.0
#define SM80_NVLINK_BW 20.0
#define SM90_NVLINK_BW 20.6
#define SM86_NVLINK_BW 12.0
#define PCI_BW 12.0 // PCI Gen3 x16
#define QPI_BW 6.0
#define AMD_BW 16.0
#define SKL_QPI_BW 10.0
#define ZPI_BW 6.0
#define YONGFENG_ZPI_BW 9.0
#define P9_BW 32.0
#define ARM_BW 6.0
#define NET_BW 12.0 // 100Gbit
// Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU
// to GPU traffic consumes more PCI bandwidth.
#define INTEL_P2P_OVERHEAD(bw) (bw*6/5)
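As a worked example of how these constants are meant to be read (a minimal sketch, not NCCL code; it simply evaluates the macros above):

#include <stdio.h>

#define PCI_BW 12.0 // PCI Gen3 x16
#define INTEL_P2P_OVERHEAD(bw) (bw*6/5)

int main(void) {
  // On Intel CPUs, GPU-to-GPU P2P traffic is split into 64B TLPs, so moving
  // data at the nominal 12 GB/s costs 12 * 6/5 = 14.4 GB/s of PCI bandwidth.
  printf("PCI bandwidth charged for Intel P2P traffic: %.1f\n", INTEL_P2P_OVERHEAD(PCI_BW));
  return 0;
}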
Link types
There are five link types between nodes: local (loopback), NVLink, PCI, CPU (traffic crossing the CPU/SMP interconnect), and network.
// We want link types and path types to match as much as possible
#define LINK_LOC 0
#define LINK_NVL 1
// Skipping 2 for PATH_NVB
#define LINK_PCI 3
// Skipping 4 for PATH_PXB
// Skipping 5 for PATH_PXN
// Skipping 6 for PATH_PHB
#define LINK_SYS 7
#define LINK_NET 8
In reality, NVLink has two sub-cases (a direct NVLink connection, or NVLink routed through an intermediate GPU), and PCI has four sub-cases (through at most one PCIe bridge without crossing the CPU root complex; through multiple PCIe bridges but still without crossing the root complex; a GPU reaching a NIC through another GPU acting as a relay; and crossing the CPU root complex). Two nodes may also be disconnected, so refining the five link types this way yields ten possible path types between nodes.
// Local (myself)
#define PATH_LOC 0
// Connection traversing NVLink
#define PATH_NVL 1
// Connection through NVLink using an intermediate GPU
#define PATH_NVB 2
// Connection traversing at most a single PCIe bridge
#define PATH_PIX 3
// Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
#define PATH_PXB 4
// Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations.
#define PATH_PXN 5
// Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
#define PATH_PHB 6
// Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
#define PATH_SYS 7
// Connection through the network
#define PATH_NET 8
// Disconnected
#define PATH_DIS 9
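Because the PATH_* values are ordered roughly by distance (local < NVLink < PCIe < host bridge < SMP interconnect < network), consumers of a computed path can make decisions with a plain numeric comparison. The helper below is an illustrative sketch, not NCCL's API; the PATH_PXB threshold mirrors the usual "stay below the PCIe host bridge" heuristic for GPUDirect RDMA:

// Illustrative only: decide whether a GPU<->NIC path is close enough for
// GPUDirect RDMA by comparing its PATH_* value against a threshold.
static int pathAllowsGdr(int pathType) {
  // PATH_PIX / PATH_PXB stay below the PCIe host bridge; PATH_PHB and beyond
  // cross the CPU and are usually considered too far.
  return pathType <= PATH_PXB;
}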
Nodes
NCCL divides topology nodes into six types: GPU, PCI, NVS, CPU (which actually stands for a NUMA node), NIC and NET (note that NCCL_TOPO_NODE_TYPES is defined as 7, one more than the number of type values actually used). These types are not isolated from one another (for example, a NIC node may hang under a PCI node, since a high-performance NIC is usually a PCIe device), but every node that appears in the System topology graph is guaranteed to be one of these types.
#define NCCL_TOPO_NODE_TYPES 7
#define GPU 0
#define PCI 1
#define NVS 2
#define CPU 3 // Actually NUMA domains
#define NIC 4
#define NET 5
Structs
A single link to a remote node is represented by ncclTopoLink, a path made up of several links by ncclTopoLinkList, a node by ncclTopoNode, the set of all nodes by ncclTopoNodeSet, and the whole interconnect system by ncclTopoSystem. In addition, ncclTopoGraph represents a logical topology produced by the search.
ncclTopoLink
ncclTopoLink describes how a node connects to a remote node and with what bandwidth. See the annotations for each member.
struct ncclTopoLink {
int type; // 5 types, LINK_LOC, LINK_NVL, LINK_PCI, LINK_SYS, LINK_NET
float bw; // pre-defined bandwidth for different type of link
struct ncclTopoNode* remNode; // remote node
};
ncclTopoLinkList
ncclTopoLinkList describes a path to another node: the links it traverses, how many hops it has, the bandwidth available along the path, and the path type. See the annotations for each member.
#define NCCL_TOPO_MAX_LINKS 128 // Max number of links per node
#define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES) // Max number of hops
struct ncclTopoLinkList {
struct ncclTopoLink* list[NCCL_TOPO_MAX_HOPS]; // links traversed along the path, in order
int count; // number of hops in the path
float bw; // bandwidth available along the path
int type; // path type (one of the PATH_* values)
};
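To make the relationship between the members concrete, here is a hedged sketch (it assumes NCCL's internal topology definitions are in scope and that paths have already been computed by the Paths module) that prints every hop of a precomputed path:

#include <stdio.h>

// Sketch: given a precomputed path, print each hop it traverses.
void printPath(struct ncclTopoLinkList* path) {
  printf("hops=%d pathType=%d bw=%.1f\n", path->count, path->type, path->bw);
  for (int h = 0; h < path->count; h++) {
    struct ncclTopoLink* link = path->list[h];
    printf("  hop %d -> remote node type %d (link type %d, bw %.1f)\n",
           h, link->remNode->type, link->type, link->bw);
  }
}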
ncclTopoNode
ncclTopoNode describes a node in the topology together with all of its links. See the annotations for each member.
struct ncclTopoNode {
int type; // node type: GPU, PCI, NVS, CPU, NIC or NET
int64_t id; // node id
// Type specific data
union {
struct {
int dev; // NVML dev number
int rank;
int cudaCompCap;
int gdrSupport;
}gpu;
struct {
int dev; // Plugin dev number
uint64_t asic;
int port;
float bw;
float latency;
int gdrSupport;
int collSupport;
int maxChannels;
}net;
struct {
int arch;
int vendor;
int model;
cpu_set_t affinity;
}cpu;
struct {
uint64_t device;
}pci;
};
int nlinks; // Links amount
struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS]; // Links to other nodes
struct ncclTopoLinkList* paths[NCCL_TOPO_NODE_TYPES]; // Pre-computed paths to GPUs and NICs
uint64_t used; // Used during search
};
ncclTopoNodeSet and ncclTopoSystem
ncclTopoNodeSet describes all nodes of one type in the topology. ncclTopoSystem contains one ncclTopoNodeSet per node type, the hashes of all hosts (hostHashes) together with systemId, the index of the local host among them, and the local host's maxBw and totalBw. See the annotations for each member.
struct ncclTopoNodeSet {
int count; // number of nodes
struct ncclTopoNode nodes[NCCL_TOPO_MAX_NODES]; // all nodes
};
struct ncclTopoSystem {
int systemId; // index of the local host in hostHashes
uint64_t hostHashes[NCCL_TOPO_MAX_NODES]; // host hash for each host
int nHosts; // number of hosts
struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES]; // all nodes, one ncclTopoNodeSet per node type
float maxBw; // max link bandwidth of localhost
float totalBw; // total bandwidth of localhost
};
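The two structures are easiest to understand by walking them. A hedged sketch (again assuming the internal definitions above are in scope) that dumps every node and its direct links:

#include <stdio.h>

// Sketch: dump every node and its direct links, mirroring how the search code
// walks the system.
void dumpSystem(struct ncclTopoSystem* system) {
  for (int t = 0; t < NCCL_TOPO_NODE_TYPES; t++) {
    for (int n = 0; n < system->nodes[t].count; n++) {
      struct ncclTopoNode* node = system->nodes[t].nodes + n;
      printf("node type=%d id=%llx nlinks=%d\n",
             node->type, (unsigned long long)node->id, node->nlinks);
      for (int l = 0; l < node->nlinks; l++) {
        struct ncclTopoLink* link = node->links + l;
        printf("  -> type=%d id=%llx via link type=%d bw=%.1f\n",
               link->remNode->type, (unsigned long long)link->remNode->id,
               link->type, link->bw);
      }
    }
  }
}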
Interfaces
The following are the key interfaces of the Topo module as selected by the author (note that some of them are not declared in topo.h). The most important is ncclTopoGetSystem, the entry point for building an ncclTopoSystem; upper layers call ncclTopoGetSystem to construct the system topology. The remaining functions mainly provide building blocks for ncclTopoGetSystem, such as creating, removing, looking up and connecting nodes.
// create system topo
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system);
// basic functions for system topo maintenance
ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id);
ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id);
ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int id);
ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float bw);
ncclTopoCreateNode
ncclTopoCreateNode creates an ncclTopoNode of the type and id supplied by the caller. The node lives in the static storage inside ncclTopoSystem (so no dynamic allocation such as malloc is needed), and most of its attributes are left undefined; for a GPU node, a loopback link to itself is initialized. The address of the new node is returned through the output parameter. See the code annotations for details.
ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) {
WARN("Error : tried to create too many nodes of type %d", type);
return ncclInternalError;
}
struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count;
system->nodes[type].count++;
n->type = type;
n->id = id;
if (type == GPU) {
// Create link to itself (used in some corner cases)
n->nlinks=1;
n->links[0].type = LINK_LOC;
n->links[0].remNode = n;
n->links[0].bw = LOC_BW;
n->gpu.dev = NCCL_TOPO_UNDEF;
n->gpu.rank = NCCL_TOPO_UNDEF;
n->gpu.cudaCompCap = NCCL_TOPO_UNDEF;
} else if (type == CPU) {
n->cpu.arch = NCCL_TOPO_UNDEF;
n->cpu.vendor = NCCL_TOPO_UNDEF;
n->cpu.model = NCCL_TOPO_UNDEF;
} else if (type == NET) {
n->net.asic = 0ULL;
n->net.port = NCCL_TOPO_UNDEF;
n->net.bw = 0.0;
n->net.latency = 0.0;
}
*node = n;
return ncclSuccess;
}
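A hedged usage sketch follows: the id, device, rank and compute-capability values are made up for illustration; real callers derive the GPU id from its PCI bus id while parsing the XML.

// Usage sketch (id/rank/compute-capability values are hypothetical): create a
// GPU node and fill in the fields that ncclTopoCreateNode leaves undefined.
static ncclResult_t addOneGpu(struct ncclTopoSystem* system, int rank) {
  struct ncclTopoNode* gpu;
  NCCLCHECK(ncclTopoCreateNode(system, &gpu, GPU, 0x1000 /* hypothetical id */));
  gpu->gpu.dev = 0;          // NVML device number
  gpu->gpu.rank = rank;      // communicator rank owning this GPU
  gpu->gpu.cudaCompCap = 90; // e.g. a Hopper GPU
  return ncclSuccess;
}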
ncclTopoRemoveNode
ncclTopoRemoveNode deletes the node of type type at position index. Besides removing it from system->nodes, the link information of the remaining nodes has to be updated. See the code annotations for details.
ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int index) {
struct ncclTopoNode* delNode = system->nodes[type].nodes+index;
// traverse every node's links and remove any link pointing to delNode
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
free(delNode->paths[t]);
for (int n=0; n<system->nodes[t].count; n++) {
struct ncclTopoNode* node = system->nodes[t].nodes+n;
if (node == delNode) continue;
// drop links to delNode (later links shift forward); also adjust remNode pointers of nodes stored after delNode, since they move down by one slot
for (int l=0; l<node->nlinks; l++) {
while (l<node->nlinks && node->links[l].remNode == delNode) {
memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink));
node->nlinks--;
}
if (l<node->nlinks && node->links[l].remNode->type == type && node->links[l].remNode >= delNode) {
node->links[l].remNode--;
}
}
}
}
// delete delNode from system->nodes
memmove(delNode, delNode+1, (system->nodes[type].count-index-1)*sizeof(struct ncclTopoNode));
system->nodes[type].count--;
return ncclSuccess;
}
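Because the removal shifts every node stored after index down by one slot, batch removals are simplest when iterating from the highest index to the lowest. An illustrative sketch:

// Sketch: remove all NET nodes (e.g. when every rank is on the same host and
// the network is not needed). Iterating backwards avoids re-indexing issues
// caused by the memmove inside ncclTopoRemoveNode.
static ncclResult_t dropAllNetNodes(struct ncclTopoSystem* system) {
  for (int n = system->nodes[NET].count - 1; n >= 0; n--) {
    NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
  }
  return ncclSuccess;
}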
ncclTopoGetNode
ncclTopoGetNode looks up a node in system->nodes by type and id and returns its address. See the code annotations for details.
ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
for (int i=0; i<system->nodes[type].count; i++) {
if (system->nodes[type].nodes[i].id == id) {
*node = system->nodes[type].nodes+i;
return ncclSuccess;
}
}
return ncclSuccess;
}
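Note that the function also returns ncclSuccess when nothing matches; the output pointer is simply left untouched. Callers therefore initialize it to NULL and test it afterwards, which gives the usual get-or-create pattern. A minimal sketch (numaId is a hypothetical parameter):

// Usage sketch: get-or-create a CPU (NUMA) node; *cpu stays NULL on a miss.
static ncclResult_t getOrCreateCpu(struct ncclTopoSystem* system, uint64_t numaId,
                                   struct ncclTopoNode** cpu) {
  *cpu = NULL;
  NCCLCHECK(ncclTopoGetNode(system, cpu, CPU, numaId));
  if (*cpu == NULL) NCCLCHECK(ncclTopoCreateNode(system, cpu, CPU, numaId));
  return ncclSuccess;
}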
ncclTopoConnectNodes
ncclTopoConnectNodes adds a link from node to remNode with link type type and bandwidth bw. Note that if a link of the same type to the same remote node already exists, the bandwidth is accumulated (judging from the code comment this aggregation is aimed at NVLink; physically it corresponds to adding another NVLink lane between two GPUs). Finally, the function re-sorts all of node's links so that higher-bandwidth links come first, which makes the later logical-topology search more convenient. See the code annotations for details.
// connect node to remNode with a given type and bandwidths
ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float bw) {
// Aggregate links into higher bw for NVLink
struct ncclTopoLink* link;
for (link = node->links; link - node->links != NCCL_TOPO_MAX_LINKS && link->remNode; link++) {
if (link->remNode == remNode && link->type == type) break;
}
if (link - node->links == NCCL_TOPO_MAX_LINKS) {
WARN("Error : too many Topo links (max %d)", NCCL_TOPO_MAX_LINKS);
return ncclInternalError;
}
// remNode == NULL means no link to remNode, so nlinks++
if (link->remNode == NULL) node->nlinks++;
link->type = type;
link->remNode = remNode;
link->bw += bw;
// Sort links in BW descending order
struct ncclTopoLink linkSave;
memcpy(&linkSave, link, sizeof(struct ncclTopoLink));
while (link != node->links) {
if ((link-1)->bw >= linkSave.bw) break;
memcpy(link, link-1, sizeof(struct ncclTopoLink));
link--;
}
memcpy(link, &linkSave, sizeof(struct ncclTopoLink));
return ncclSuccess;
}
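Also note that ncclTopoConnectNodes only modifies node's link array, so a bidirectional connection takes one call per direction. A hedged sketch for two GPUs connected by NVLink (nvlinkBw stands in for the per-generation constant):

// Sketch: connect gpu0 and gpu1 over NVLink in both directions. Calling the
// function again for the same pair and type accumulates bandwidth (extra lanes).
static ncclResult_t connectGpuPair(struct ncclTopoNode* gpu0, struct ncclTopoNode* gpu1,
                                   float nvlinkBw) {
  NCCLCHECK(ncclTopoConnectNodes(gpu0, gpu1, LINK_NVL, nvlinkBw));
  NCCLCHECK(ncclTopoConnectNodes(gpu1, gpu0, LINK_NVL, nvlinkBw));
  return ncclSuccess;
}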
ncclTopoGetSystem
ncclTopoGetSystem is the most important function of the Topo module. NCCL calls it during initialization to obtain the system topology, which the later logical-topology search builds on. The local system relies on the XML module to discover GPUs and NICs; when multi-node NVLink ("south-bound" GPU interconnect, MNNVL) is supported, the local XML is serialized and exchanged over the bootstrap network so that every peer in the clique obtains all peers' XML via AllGather, the per-peer XML is then fused, and finally ncclTopoGetSystemFromXml builds the ncclTopoSystem.
ncclTopoGetSystemFromXml first calls ncclTopoAddCpu for each NUMA node to recursively discover the PCI devices and NICs under it; at that point all nodes exist, but the link information between them is still incomplete. The following ncclTopoAddNvLinks, ncclTopoAddC2c and ncclTopoConnectCpus calls add the missing links.
Finally, once the system topology is complete, ncclTopoSortSystem sorts it so that the UPLINK of each PCI tree sits at the end of its link array, which simplifies the later logical-topology computation.
[Figure: the main call chain of ncclTopoGetSystem]
See the code annotations for details.
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
struct ncclXml* xml;
NCCLCHECK(xmlAlloc(&xml, NCCL_TOPO_XML_MAX_NODES));
// able to read pre-defined topo info from xml file
const char* xmlTopoFile = ncclGetEnv("NCCL_TOPO_FILE");
if (xmlTopoFile) {
INFO(NCCL_ENV, "NCCL_TOPO_FILE set by environment to %s", xmlTopoFile);
NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml, 1));
} else {
// Try default XML topology location
NCCLCHECK(ncclTopoGetXmlFromFile("/var/run/nvidia-topologyd/virtualTopology.xml", xml, 0));
}
if (xml->maxIndex == 0) {
// Create top tag
struct ncclXmlNode* top;
NCCLCHECK(xmlAddNode(xml, NULL, "system", &top));
NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION));
}
// Auto-detect GPUs if needed
for (int r=0; r<comm->nRanks; r++) {
// on same host
if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
struct ncclXmlNode* node;
NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
if (node == NULL) continue;
NCCLCHECK(xmlSetAttrInt(node, "keep", 1));
NCCLCHECK(xmlSetAttrInt(node, "rank", r));
NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
}
}
// Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
// so we start with collnet so that it has precedence.
int netDevCount = 0;
if (collNetSupport(comm)) {
NCCLCHECK(collNetDevices(comm, &netDevCount));
for (int n=0; n<netDevCount; n++) {
ncclNetProperties_t props;
NCCLCHECK(collNetGetProperties(comm, n, &props));
struct ncclXmlNode* netNode;
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
NCCLCHECK(xmlInitAttrInt(netNode, "coll", 1));
}
}
if (netDevCount == 0) {
NCCLCHECK(comm->ncclNet->devices(&netDevCount));
}
for (int n=0; n<netDevCount; n++) {
ncclNetProperties_t props;
NCCLCHECK(comm->ncclNet->getProperties(n, &props));
struct ncclXmlNode* netNode;
NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1));
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
NCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency));
NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF));
INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name);
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport));
}
// Remove XML branches which don't have a node with keep="1" (typically when importing a topology)
NCCLCHECK(ncclTopoTrimXml(xml));
// merge multiple hosts into one system when multi-node NVLink (MNNVL) is supported by the hardware
if (comm->MNNVL) {
// MNNVL clique support
char* mem;
NCCLCHECK(ncclCalloc(&mem, comm->clique.size * xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
struct ncclXml* rankXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*comm->cliqueRank);
memcpy(rankXml, xml, xmlMemSize(NCCL_TOPO_XML_MAX_NODES));
NCCLCHECK(ncclTopoConvertXml(rankXml, (uintptr_t)xml->nodes, 1));
NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->clique.ranks, comm->cliqueRank, comm->clique.size, mem, xmlMemSize(NCCL_TOPO_XML_MAX_NODES)));
struct ncclXml* cliqueXml;
NCCLCHECK(xmlAlloc(&cliqueXml, comm->clique.size*NCCL_TOPO_XML_MAX_NODES));
for (int i = 0; i < comm->clique.size; i++) {
struct ncclXml* peerXml = (struct ncclXml*)(mem+xmlMemSize(NCCL_TOPO_XML_MAX_NODES)*i);
NCCLCHECK(ncclTopoConvertXml(peerXml, (uintptr_t)peerXml->nodes, 0));
NCCLCHECK(ncclTopoFuseXml(cliqueXml, peerXml));
}
free(xml);
xml = cliqueXml;
}
xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml));
}
// generate ncclTopoSystem using ncclXml
NCCLCHECK(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash));
free(xml);
return ncclSuccess;
}
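As the code above shows, detection can also be bypassed or inspected through environment variables: NCCL_TOPO_FILE injects a pre-built topology XML (with /var/run/nvidia-topologyd/virtualTopology.xml as the default fallback), while NCCL_TOPO_DUMP_FILE writes the detected topology back out on the rank selected by ncclParamTopoDumpFileRank(), which is convenient when debugging topology issues on a new machine.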
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem, const uint64_t localHostHash) {
NCCLCHECK(ncclCalloc(topoSystem, 1));
struct ncclTopoSystem* system = *topoSystem;
struct ncclXmlNode* topNode;
NCCLCHECK(xmlFindTag(xml, "system", &topNode));
// Create system topo for each numa node
for (int s=0; s<topNode->nSubs; s++) {
struct ncclXmlNode* node = topNode->subs[s];
if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem));
}
for (int systemId=0; systemId<system->nHosts; systemId++) if (system->hostHashes[systemId] == localHostHash) system->systemId = systemId;
// Add NVLinks for GPU-GPU or CPU-GPU connections
NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL, 0));
// Add C2C links for GPU-CPU connections
NCCLCHECK(ncclTopoAddC2c(topNode, *topoSystem, NULL, 0));
// Flatten BCM switches
NCCLCHECK(ncclTopoFlattenBcmSwitches(*topoSystem));
// Connect all CPU nodes together
NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
// Sort the system
NCCLCHECK(ncclTopoSortSystem(*topoSystem));
return ncclSuccess;
}