NCCL源码解读的视频在这:NCCL集合通信源码解读、案例、任务调度、拓扑_哔哩哔哩_bilibili
一、环境变量设置
1.1 命令行环境变量设置
为了打印XML拓扑文件,需要设置NCCL的环境变量NCCL_TOPO_DUMP_FILE。这个环境变量指定了XML拓扑文件的输出路径和文件名。例如,可以在命令行中设置此环境变量:
export NCCL_TOPO_DUMP_FILE=/path/to/output/topo.xml
1.2 代码中环境变量设置
或者,在代码中设置此环境变量(假设使用的是C/C++):
setenv("NCCL_TOPO_DUMP_FILE", "/path/to/output/topo.xml", 1);
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <nccl.h>

/*
 * Minimal example: request an NCCL topology dump from code.
 *
 * Sets NCCL_TOPO_DUMP_FILE, creates a single-GPU communicator, and
 * destroys it again. Returns 0 on success, EXIT_FAILURE if the
 * communicator could not be created.
 */
int main() {
  // Set the environment variable (done here before the communicator is created).
  setenv("NCCL_TOPO_DUMP_FILE", "/path/to/output/topo.xml", 1);

  // Initialize the NCCL communicator (a single GPU is used as the example).
  ncclComm_t comm;
  int nDevices = 1;
  cudaSetDevice(0);  // Select the CUDA device.

  // ncclCommInitAll expects a pointer to an array of communicators,
  // so the address of comm is passed, not comm itself.
  ncclResult_t rc = ncclCommInitAll(&comm, nDevices, NULL);
  if (rc != ncclSuccess) {
    fprintf(stderr, "ncclCommInitAll failed: %s\n", ncclGetErrorString(rc));
    return EXIT_FAILURE;
  }

  // ... other NCCL operations ...

  // Destroy the NCCL communicator.
  ncclCommDestroy(comm);
  return 0;
}
二、生成xml拓扑文件
按照1.1中的方法设置了环境变量,然后运行NCCL-tests,成功生成topo.xml文件
NCCL-tests的安装和使用教程请参考官方仓库:https://github.com/NVIDIA/nccl-tests
三、xml拓扑文件
<system version="1">
  <cpu numaid="0" affinity="0000,000fffff,ff000000,0fffffff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="106">
    <pci busid="0000:31:00.0" class="0x030200" vendor="0x10de" device="0x1eb8" subsystem_vendor="0x10de" subsystem_device="0x12a2" link_speed="8 GT/s" link_width="16">
      <gpu dev="0" sm="75" rank="0" gdr="1"/>
    </pci>
    <nic>
      <net name="eth0" dev="0" speed="10000" port="0" latency="0.000000" guid="0x0" maxconn="65536" gdr="0"/>
    </nic>
  </cpu>
</system>
四、拓扑文件保存代码
1、如果设置了环境变量NCCL_TOPO_DUMP_FILE,并且当前rank等于ncclParamTopoDumpFileRank()返回的rank,则调用保存拓扑文件的函数ncclTopoDumpXmlToFile
// Read the requested dump path from the environment.
xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE");
// Dump only when a path was supplied and this rank matches the value
// returned by ncclParamTopoDumpFileRank().
if (xmlTopoFile != NULL && ncclParamTopoDumpFileRank() == comm->rank) {
  INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
  NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml));
}
2、ncclTopoDumpXmlToFile中打开文件,并调用ncclTopoDumpXmlRec向文件中写入内容
/*
 * Write the XML tree held in `xml` to the file `xmlTopoFile`.
 *
 * The file is opened for writing ("w"). If it cannot be opened, a warning
 * is logged and ncclSuccess is returned, so a failed dump does not abort
 * the caller. Otherwise the tree is written starting at xml->nodes with
 * indentation 0, and the result of that write is returned.
 */
ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml) {
  FILE* file = fopen(xmlTopoFile, "w");
  if (file == NULL) {
    WARN("Unable to open %s, not dumping topology.", xmlTopoFile);
    return ncclSuccess;
  }

  // Keep the result instead of returning early through NCCLCHECK, so the
  // file is closed on the error path as well.
  ncclResult_t result = ncclTopoDumpXmlRec(0, file, xml->nodes);
  fclose(file);
  return result;
}
3、ncclTopoDumpXmlRec递归地向文件中写入当前节点及其所有子节点
ncclResult_t ncclTopoDumpXmlRec(int indent, FILE* file, struct ncclXmlNode* node) {for (int i=0; i<indent; i++) fprintf(file, " ");fprintf(file, "<%s", node->name);for (int a=0; a<node->nAttrs; a++) {fprintf(file, " %s=\"%s\"", node->attrs[a].key, node->attrs[a].value);}if (node->nSubs == 0) {fprintf(file, "/>\n");} else {fprintf(file, ">\n");for (int s=0; s<node->nSubs; s++) {NCCLCHECK(ncclTopoDumpXmlRec(indent+2, file, node->subs[s]));}for (int i=0; i<indent; i++) fprintf(file, " ");fprintf(file, "</%s>\n", node->name);}return ncclSuccess;
}