在 ncclLaunchPrepare 内,nWorkBudget 是每个 kernel(plan)的**工作预算限制**:非 persistent 的 plan 最多占用 comm->workFifoDepth/2,persistent 的 plan 则为 INT_MAX。源码注释还特别强调:Drain coll tasks first. This is essential since we partition tasks based on the work budget and p2p work isn’t collective. If we were to drain p2p first, the place where we cut the kernel could vary by rank which would cause the “shortest channel first” channel picker to have divergent results. 也就是说,如果先调度 P2P,就会破坏 kernel 执行时对 channel 的选择:P2P 不是集合操作,会占用不同的 channel,按工作预算切分 kernel 的位置可能因 rank 而异,于是各通道的负载量 chans[c].collBytes 的累加情况也不同,最终导致各 rank 的通道负载不一致。
ncclLaunchPrepare
ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { if(ncclParamPassSm() && tasks->nTasksP2p > 0) { NCCLCHECK(preScheduleP2pUseNokernel(comm)); } if (tasks->nTasksColl + tasks->nTasksP2p != 0) { do { struct ncclKernelPlan* plan = ncclMemoryPoolAlloc<struct ncclKernelPlan>(&comm->memPool_ncclKernelPlan, &comm->memPermanent); ncclIntruQueueEnqueue(&comm->planQueue, plan); nPlans += 1; plan->comm = comm; plan->reclaimer.fn = reclaimPlan; plan->persistent = persistent; // Non-persistent kernels fill up at most half of our fifo per kernel. int nWorkBudget = plan->persistent ? INT_MAX : comm->workFifoDepth/2; int nWorkBudgetOld = nWorkBudget; // Drain coll tasks first. This is essential since we partition tasks based // on the work budget and p2p work isn't collective. If we were to drain p2p // first, the place where we cut the kernel could vary by rank which would // cause the "shortest channel first" channel picker to have divergent results. if (tasks->nTasksColl != 0) { NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &nWorkBudget), result, failure); } // And only drain p2p tasks once colls are depleted. if (tasks->nTasksColl == 0 && tasks->nTasksP2p != 0) { NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &nWorkBudget), result, failure); }}
// Choose the `nBid` least loaded channels to do the work. This ensures // all bids go to different channels in case they need to synchronize. least[0] = 0; maxIndexInLeast = 0; maxBytesInLeast = chans[0].collBytes; // Initialize least[] such that the first nBid channels are accounted for.
// Poll for callbacks sent to us from other threads. Typically these free// resources from to our memory pools.NCCLCHECK(ncclCommPollCallbacks(comm, /*waitSome=*/false));// We already have one frame present which holds all of our tasks (which we// are about to schedule). Now push an additional frame for allocating// work structs (see appendWorkElem() variants all use scoped allocation).ncclMemoryStackPush(&comm->memScoped);
struct ncclKernelPlan* planHead = ncclIntruQueueHead(&comm->planQueue);comm->unlaunchedPlansHead = planHead;cudaStream_t launchStream = tasks->streams->stream;NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->sharedRes->deviceStream), result, failure);// Create dependency for device stream on user streams. First from extra user// streams to deviceStream. Then deviceStream to first user stream.for (struct ncclCudaStreamList* l=tasks->streams->next; l != nullptr; l = l->next) { NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, &comm->sharedRes->deviceStream, l->stream), result, failure);}NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->sharedRes->deviceStream), result, failure);
if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking) { // We have to launch host tasks to push proxy args. We are careful to only // do this if necessary since host tasks impose a high performance cost in CUDA. bool acquired = false; for (struct ncclKernelPlan* plan=planHead; plan != nullptr; plan = plan->next) { if (plan->hasProxyOps) { if (!acquired) { acquired = true; NCCLCHECKGOTO(ncclStrongStreamAcquire(tasks->capturingGraph, &comm->sharedRes->hostStream), result, failure); } NCCLCHECKGOTO(ncclStrongStreamLaunchHost(tasks->capturingGraph, &comm->sharedRes->hostStream, hostStreamPlanCallback, plan), result, failure); } } if (acquired) { // Make to-be-launched kernels dependent on just-launched host stream tasks. NCCLCHECKGOTO(ncclStrongStreamWaitStream(tasks->capturingGraph, launchStream, &comm->sharedRes->hostStream), result, failure); NCCLCHECKGOTO(ncclStrongStreamRelease(tasks->capturingGraph, &comm->sharedRes->hostStream), result, failure); }}if (persistent) { comm->persistentRefs += nPlans; NCCLCHECKGOTO(ncclCudaGraphAddDestructor(tasks->capturingGraph, persistentDestructor, (void*)planHead), result, failure);}
if (false) {failure: ncclMemoryStackPop(&comm->memScoped); // deallocate ncclWork's}
如果失败,就跳转到 failure 标签,通过 ncclMemoryStackPop(&comm->memScoped) 弹出刚才压入的内存栈帧(释放为 ncclWork 分配的内存)。至此 ncclLaunchPrepare 结束。
**ncclLaunchKernelBefore**建立cleanup的"回调契约"
// Pre-launch hook for a single kernel plan: uploads the plan's work via
// uploadWork() and reports the outcome.
//
// comm: communicator the plan belongs to (forwarded to uploadWork).
// plan: kernel plan whose work is uploaded (forwarded to uploadWork).
//
// Returns ncclSuccess once uploadWork() has been checked by NCCLCHECK.
// NOTE(review): NCCLCHECK is presumably the usual "return early on a
// non-success result" macro - confirm against its definition.
ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) {
  // This code is called after we've checked in to the intra-process barrier
  // but before launching the kernel. We are not allowed to call CUDA unless the
  // kernel launch is captured.
  NCCLCHECK(uploadWork(comm, plan));
  return ncclSuccess;
}
node201:709393:712147 [1] NCCL INFO New proxy send connection 96 from local rank 1, transport 2node201:709393:712147 [1] NCCL INFO proxyProgressAsync opId=0x7f0215c09cc0 op.type=1 op.reqBuff=0x7f01c409c740 op.respSize=16 donenode201:709393:712006 [1] NCCL INFO ncclPollProxyResponse Received new opId=0x7f0215c09cc0node201:709393:712006 [1] NCCL INFO resp.opId=0x7f0215c09cc0 matches expected opId=0x7f0215c09cc0node201:709393:712147 [1] NCCL INFO Received and initiated operation=Init res=0node201:709393:712006 [1] NCCL INFO Connected to proxy localRank 1 -> connection 0x7f01c4008540node201:709393:712147 [1] NCCL INFO proxyProgressAsync opId=0x7f0215c09cc0 op.type=2 op.reqBuff=0x7f01c409dcb0 op.respSize=0 donenode201:709393:712006 [1] NCCL INFO ncclPollProxyResponse Received new opId=0x7f0215c09cc0node201:709393:712006 [1] NCCL INFO resp.opId=0x7f0215c09cc0 matches expected opId=0x7f0215c09cc0node201:709393:712147 [1] NCCL INFO Received and initiated operation=SharedInit res=0有的还打印出了node201:709393:712147 [1] NCCL INFO Received and initiated operation=SharedInit res=0...node201:709392:712151 [0] NCCL INFO New proxy recv connection 73 from local rank 0, transport 0node201:709392:712151 [0] NCCL INFO proxyProgressAsync opId=0x7f9ea0007c20 op.type=1 op.reqBuff=0x7fa4f400ca30 op.respSize=16 donenode201:709392:712151 [0] NCCL INFO Received and initiated operation=Init res=0node201:709392:712320 [0] NCCL INFO ncclPollProxyResponse Received new opId=0x7f9ea0007c20node201:709392:712320 [0] NCCL INFO resp.opId=0x7f9ea0007c20 matches expected opId=0x7f9ea0007c20node201:709392:712320 [0] NCCL INFO Connected to proxy localRank 0 -> connection 0x7fa4f4007850node201:709393:712147 [1] NCCL INFO New proxy send connection 97 from local rank 1, transport 0node201:709393:712147 [1] NCCL INFO proxyProgressAsync opId=0x7efbdc007c20 op.type=1 op.reqBuff=0x7f01c400ca30 op.respSize=16 donenode201:709393:712147 [1] NCCL INFO Received and initiated operation=Init res=0node201:709393:712321 [1] 
NCCL INFO ncclPollProxyResponse Received new opId=0x7efbdc007c20node201:709393:712321 [1] NCCL INFO resp.opId=0x7efbdc007c20 matches expected opId=0x7efbdc007c20node201:709393:712321 [1] NCCL INFO Connected to proxy localRank 1 -> connection 0x7f01c40085d0node201:709394:712148 [2] NCCL INFO New proxy send connection 97 from local rank 2, transport 0node201:709394:712148 [2] NCCL INFO proxyProgressAsync opId=0x7f1f6c007c20 op.type=1 op.reqBuff=0x7f254c00ca30 op.respSize=16 donenode201:709394:712148 [2] NCCL INFO Received and initiated operation=Init res=0node201:709394:712322 [2] NCCL INFO ncclPollProxyResponse Received new opId=0x7f1f6c007c20node201:709394:712322 [2] NCCL INFO resp.opId=0x7f1f6c007c20 matches expected opId=0x7f1f6c007c20node201:709394:712322 [2] NCCL INFO Connected to proxy localRank 2 -> connection 0x7f254c0085d0node201:709397:712138 [5] NCCL INFO New proxy send connection 97 from local rank 5, transport 0node201:709397:712138 [5] NCCL INFO proxyProgressAsync opId=0x7f9c24007c20 op.type=1 op.reqBuff=0x7fa20000ca30 op.respSize=16 donenode201:709397:712138 [5] NCCL INFO Received and initiated operation=Init res=0node201:709397:712325 [5] NCCL INFO ncclPollProxyResponse Received new opId=0x7f9c24007c20node201:709397:712325 [5] NCCL INFO resp.opId=0x7f9c24007c20 matches expected opId=0x7f9c24007c20node201:709397:712325 [5] NCCL INFO Connected to proxy localRank 5 -> connection 0x7fa2000085d0node201:709396:712145 [4] NCCL INFO New proxy send connection 97 from local rank 4, transport 0node201:709396:712145 [4] NCCL INFO proxyProgressAsync opId=0x7fdd74007c20 op.type=1 op.reqBuff=0x7fe35800ca30 op.respSize=16 donenode201:709396:712145 [4] NCCL INFO Received and initiated operation=Init res=0node201:709396:712324 [4] NCCL INFO ncclPollProxyResponse Received new opId=0x7fdd74007c20node201:709396:712324 [4] NCCL INFO resp.opId=0x7fdd74007c20 matches expected opId=0x7fdd74007c20node201:709396:712324 [4] NCCL INFO Connected to proxy localRank 4 -> connection 
0x7fe3580085d0node201:709398:712141 [6] NCCL INFO New proxy send connection 97 from local rank 6, transport 0node201:709398:712141 [6] NCCL INFO proxyProgressAsync opId=0x7fcd64007c20 op.type=1 op.reqBuff=0x7fd35000ca30 op.respSize=16 donenode201:709398:712141 [6] NCCL INFO Received and initiated operation=Init res=0node201:709398:712326 [6] NCCL INFO ncclPollProxyResponse Received new opId=0x7fcd64007c20node201:709398:712326 [6] NCCL INFO resp.opId=0x7fcd64007c20 matches expected opId=0x7fcd64007c20node201:709398:712326 [6] NCCL INFO Connected to proxy localRank 6 -> connection 0x7fd3500085d0node201:709395:712137 [3] NCCL INFO New proxy send connection 97 from local rank 3, transport 0node201:709395:712137 [3] NCCL INFO proxyProgressAsync opId=0x7fe250007c20 op.type=1 op.reqBuff=0x7fe83c00ca30 op.respSize=16 donenode201:709395:712137 [3] NCCL INFO Received and initiated operation=Init res=0node201:709395:712323 [3] NCCL INFO ncclPollProxyResponse Received new opId=0x7fe250007c20node201:709395:712323 [3] NCCL INFO resp.opId=0x7fe250007c20 matches expected opId=0x7fe250007c20node201:709395:712323 [3] NCCL INFO Connected to proxy localRank 3 -> connection 0x7fe83c0085d0node201:709399:712139 [7] NCCL INFO New proxy send connection 73 from local rank 7, transport 0node201:709399:712139 [7] NCCL INFO proxyProgressAsync opId=0x7f0ae0007c20 op.type=1 op.reqBuff=0x7f10bc00ca30 op.respSize=16 donenode201:709399:712139 [7] NCCL INFO Received and initiated operation=Init res=0node201:709399:712327 [7] NCCL INFO ncclPollProxyResponse Received new opId=0x7f0ae0007c20node201:709399:712327 [7] NCCL INFO resp.opId=0x7f0ae0007c20 matches expected opId=0x7f0ae0007c20node201:709399:712327 [7] NCCL INFO Connected to proxy localRank 7 -> connection 0x7f10bc007850node201:709392:712151 [0] NCCL INFO proxyProgressAsync opId=0x7f9ea0007c20 op.type=3 op.reqBuff=0x7fa4f407b370 op.respSize=80 donenode201:709392:712320 [0] NCCL INFO ncclPollProxyResponse Received new 
opId=0x7f9ea0007c20node201:709392:712151 [0] NCCL INFO Received and initiated operation=Setup res=0node201:709392:712320 [0] NCCL INFO resp.opId=0x7f9ea0007c20 matches expected opId=0x7f9ea0007c20node201:709399:712139 [7] NCCL INFO proxyProgressAsync opId=0x7f0ae0007c20 op.type=3 op.reqBuff=(nil) op.respSize=240 donenode201:709399:712327 [7] NCCL INFO ncclPollProxyResponse Received new opId=0x7f0ae0007c20node201:709399:712139 [7] NCCL INFO Received and initiated operation=Setup res=0node201:709399:712327 [7] NCCL INFO resp.opId=0x7f0ae0007c20 matches expected opId=0x7f0ae0007c20node201:709396:712145 [4] NCCL INFO proxyProgressAsync opId=0x7fdd74007c20 op.type=3 op.reqBuff=(nil) op.respSize=240 donenode201:709396:712324 [4] NCCL INFO ncclPollProxyResponse Received new opId=0x7fdd74007c20node201:709396:712145 [4] NCCL INFO Received and initiated operation=Setup res=0node201:709396:712324 [4] NCCL INFO resp.opId=0x7fdd74007c20 matches expected opId=0x7fdd74007c20node201:709397:712138 [5] NCCL INFO proxyProgressAsync opId=0x7f9c24007c20 op.type=3 op.reqBuff=(nil) op.respSize=240 donenode201:709397:712325 [5] NCCL INFO ncclPollProxyResponse Received new opId=0x7f9c24007c20node201:709397:712138 [5] NCCL INFO Received and initiated operation=Setup res=0node201:709397:712325 [5] NCCL INFO resp.opId=0x7f9c24007c20 matches expected opId=0x7f9c24007c20node201:709399:712327 [7] NCCL INFO ProxyCall UDS comm 0x55f0ee791ec0 rank 7 tpRank 0(64b7c3a0585db18e) reqSize 8 respSize 0 respFd 0x7f0ae9ffebe0 opId 0xc38d1761f4ed51d7node201:709393:712147 [1] NCCL INFO proxyProgressAsync opId=0x7efbdc007c20 op.type=3 op.reqBuff=(nil) op.respSize=240 donenode201:709393:712321 [1] NCCL INFO ncclPollProxyResponse Received new opId=0x7efbdc007c20node201:709393:712147 [1] NCCL INFO Received and initiated operation=Setup res=0node201:709393:712321 [1] NCCL INFO resp.opId=0x7efbdc007c20 matches expected opId=0x7efbdc007c20node201:709394:712148 [2] NCCL INFO proxyProgressAsync opId=0x7f1f6c007c20 
op.type=3 op.reqBuff=(nil) op.respSize=240 donenode201:709394:712322 [2] NCCL INFO ncclPollProxyResponse Received new opId=0x7f1f6c007c20node201:709394:712148 [2] NCCL INFO Received and initiated operation=Setup res=0node201:709394:712322 [2] NCCL INFO resp.opId=0x7f1f6c007c20 matches expected opId=0x7f1f6c007c20node201:709392:712151 [0] NCCL INFO New proxy recv connection 74 from local rank 0, transport 0node201:709392:712151 [0] NCCL INFO proxyProgressAsync opId=0x7f9ea0007c20 op.type=1 op.reqBuff=0x7fa4f407b370 op.respSize=16 donenode201:709392:712320 [0] NCCL INFO ncclPollProxyResponse Received new opId=0x7f9ea0007c20node201:709392:712151 [0] NCCL INFO Received and initiated operation=Init res=0node201:709392:712320 [0] NCCL INFO resp.opId=0x7f9ea0007c20 matches expected opId=0x7f9ea0007c20node201:709392:712320 [0] NCCL INFO Connected to proxy localRank 0 -> connection 0x7fa4f40078e0node201:709392:712152 [0] NCCL INFO proxyUDSRecvReq::ncclProxyMsgGetFd rank 7 opId 0xc38d1761f4ed51d7 handle=0x7fa4f407d330node201:709392:712152 [0] NCCL INFO UDS proxyGetFd received handle 0x7fa4f407d330 peer 7 opId c38d1761f4ed51d7node201:709398:712141 [6] NCCL INFO proxyProgressAsync opId=0x7fcd64007c20 op.type=3 op.reqBuff=(nil) op.respSize=240 donenode201:709398:712326 [6] NCCL INFO ncclPollProxyResponse Received new opId=0x7fcd64007c20node201:709398:712141 [6] NCCL INFO Received and initiated operation=Setup res=0node201:709398:712326 [6] NCCL INFO resp.opId=0x7fcd64007c20 matches expected opId=0x7fcd64007c20node201:709395:712137 [3] NCCL INFO proxyProgressAsync opId=0x7fe250007c20 op.type=3 op.reqBuff=(nil) op.respSize=240 donenode201:709395:712323 [3] NCCL INFO ncclPollProxyResponse Received new opId=0x7fe250007c20node201:709395:712137 [3] NCCL INFO Received and initiated operation=Setup res=0node201:709395:712323 [3] NCCL INFO resp.opId=0x7fe250007c20 matches expected opId=0x7fe250007c20node201:709399:712327 [7] NCCL INFO ProxyCall UDS comm 0x55f0ee791ec0 rank 7 tpRank 
0(64b7c3a0585db18e) reqSize 8 respSize 0 respFd 157 opId 0xc38d1761f4ed51d7 - DONEnode201:709399:712327 [7] NCCL INFO UDS: ClientGetFd handle 0x7fa4f407d330 tpRank 0 returned fd 157node201:709399:712139 [7] NCCL INFO proxyProgressAsync opId=0x7f0ae0007c20 op.type=4 op.reqBuff=0x7f10bc089ae0 op.respSize=0 donenode201:709399:712139 [7] NCCL INFO Received and initiated operation=Connect res=0node201:709399:712327 [7] NCCL INFO ncclPollProxyResponse Received new opId=0x7f0ae0007c20node201:709399:712327 [7] NCCL INFO resp.opId=0x7f0ae0007c20 matches expected opId=0x7f0ae0007c20node201:709392:712151 [0] NCCL INFO proxyProgressAsync opId=0x7f9ea0007c20 op.type=3 op.reqBuff=0x7fa4f407ed70 op.respSize=80 donenode201:709392:712320 [0] NCCL INFO ncclPollProxyResponse Received new opId=0x7f9ea0007c20node201:709392:712151 [0] NCCL INFO Received and initiated operation=Setup res=0node201:709392:712320 [0] NCCL INFO resp.opId=0x7f9ea0007c20 matches expected opId=0x7f9ea0007c20node201:709398:712326 [6] NCCL INFO ProxyCall UDS comm 0x56470487fa70 rank 6 tpRank 0(64b7c3a0585db18e) reqSize 8 respSize 0 respFd 0x7fcd6dffebe0 opId 0xa21d501bb3b45da8node201:709392:712151 [0] NCCL INFO New proxy recv connection 75 from local rank 0, transport 0node201:709392:712151 [0] NCCL INFO proxyProgressAsync opId=0x7f9ea0007c20 op.type=1 op.reqBuff=0x7fa4f407ed70 op.respSize=16 donenode201:709392:712320 [0] NCCL INFO ncclPollProxyResponse Received new opId=0x7f9ea0007c20node201:709392:712151 [0] NCCL INFO Received and initiated operation=Init res=0node201:709392:712320 [0] NCCL INFO resp.opId=0x7f9ea0007c20 matches expected opId=0x7f9ea0007c20node201:709392:712320 [0] NCCL INFO Connected to proxy localRank 0 -> connection 0x7fa4f4007970node201:709392:712152 [0] NCCL INFO proxyUDSRecvReq::ncclProxyMsgGetFd rank 6 opId 0xa21d501bb3b45da8 handle=0x7fa4f407ed90node201:709392:712152 [0] NCCL INFO UDS proxyGetFd received handle 0x7fa4f407ed90 peer 6 opId a21d501bb3b45da8node201:709397:712325 [5] NCCL 
INFO ProxyCall UDS comm 0x556d5e45ddb0 rank 5 tpRank 0(64b7c3a0585db18e) reqSize 8 respSize 0 respFd 0x7f9c2dffebe0 opId 0x40f06cb450b4d0cnode201:709392:712151 [0] NCCL INFO proxyProgressAsync opId=0x7f9ea0007c20 op.type=3 op.reqBuff=0x7fa4f4080a50 op.respSize=80 donenode201:709392:712320 [0] NCCL INFO ncclPollProxyResponse Received new opId=0x7f9ea0007c20node201:709392:712151 [0] NCCL INFO Received and initiated operation=Setup res=0node201:709392:712320 [0] NCCL INFO resp.opId=0x7f9ea0007c20 matches expected opId=0x7f9ea0007c20node201:709392:712151 [0] NCCL INFO New proxy recv connection 76 from local rank 0, transport 0node201:709392:712151 [0] NCCL INFO proxyProgressAsync opId=0x7f9ea0007c20 op.type=1 op.reqBuff=0x7fa4f4080a50 op.respSize=16 donenode201:709392:712320 [0] NCCL INFO ncclPollProxyResponse Received new opId=0x7f9ea0007c20node201:709392:712151 [0] NCCL INFO Received and initiated operation=Init res=0node201:709392:712320 [0] NCCL INFO resp.opId=0x7f9ea0007c20 matches expected opId=0x7f9ea0007c20node201:709392:712320 [0] NCCL INFO Connected to proxy localRank 0 -> connection 0x7fa4f4007a00node201:709398:712326 [6] NCCL INFO ProxyCall UDS comm 0x56470487fa70 rank 6 tpRank 0(64b7c3a0585db18e) reqSize 8 respSize 0 respFd 157 opId 0xa21d501bb3b45da8 - DONEnode201:709398:712326 [6] NCCL INFO UDS: ClientGetFd handle 0x7fa4f407ed90 tpRank 0 returned fd 157node201:709392:712152 [0] NCCL INFO proxyUDSRecvReq::ncclProxyMsgGetFd rank 5 opId 0x40f06cb450b4d0c handle=0x7fa4f4080a70node201:709392:712152 [0] NCCL INFO UDS proxyGetFd received handle 0x7fa4f4080a70 peer 5 opId 40f06cb450b4d0cnode201:709398:712141 [6] NCCL INFO proxyProgressAsync opId=0x7fcd64007c20 op.type=4 op.reqBuff=0x7fd3500af820 op.respSize=0 donenode201:709398:712141 [6] NCCL INFO Received and initiated operation=Connect res=0node201:709398:712326 [6] NCCL INFO ncclPollProxyResponse Received new opId=0x7fcd64007c20node201:709398:712326 [6] NCCL INFO resp.opId=0x7fcd64007c20 matches expected 
opId=0x7fcd64007c20node201:709392:712151 [0] NCCL INFO proxyProgressAsync opId=0x7f9ea0007c20 op.type=3 op.reqBuff=0x7fa4f4082730 op.respSize=80 donenode201:709392:712320 [0] NCCL INFO ncclPollProxyResponse Received new opId=0x7f9ea0007c20node201:709392:712151 [0] NCCL INFO Received and initiated operation=Setup res=0node201:709392:712320 [0] NCCL INFO resp.opId=0x7f9ea0007c20 matches expected opId=0x7f9ea0007c20node201:709396:712324 [4] NCCL INFO ProxyCall UDS comm 0x564946ccbfb0 rank 4 tpRank 0(64b7c3a0585db18e) reqSize 8 respSize 0 respFd 0x7fdd7dffebe0 opId 0x9666afda57f1549enode201:709392:712151 [0] NCCL INFO New proxy recv connection 77 from local rank 0, transport 0node201:709392:712151 [0] NCCL INFO proxyProgressAsync opId=0x7f9ea0007c20 op.type=1 op.reqBuff=0x7fa4f4082730 op.respSize=16 donenode201:709392:712320 [0] NCCL INFO ncclPollProxyResponse Received new opId=0x7f9ea0007c20node201:709392:712151 [0] NCCL INFO Received and initiated operation=Init res=0node201:709392:712320 [0] NCCL INFO resp.opId=0x7f9ea0007c20 matches expected opId=0x7f9ea0007c20node201:709392:712320 [0] NCCL INFO Connected to proxy localRank 0 -> connection 0x7fa4f4007a90node201:709392:712152 [0] NCCL INFO proxyUDSRecvReq::ncclProxyMsgGetFd rank 4 opId 0x9666afda57f1549e handle=0x7fa4f4082750node201:709392:712152 [0] NCCL INFO UDS proxyGetFd received handle 0x7fa4f4082750 peer 4 opId 9666afda57f1549enode201:709397:712325 [5] NCCL INFO ProxyCall UDS comm 0x556d5e45ddb0 rank 5 tpRank 0(64b7c3a0585db18e) reqSize 8 respSize 0 respFd 157 opId 0x40f06cb450b4d0c - DONEnode201:709397:712325 [5] NCCL INFO UDS: ClientGetFd handle 0x7fa4f4080a70 tpRank 0 returned fd 157node201:709396:712324 [4] NCCL INFO ProxyCall UDS comm 0x564946ccbfb0 rank 4 tpRank 0(64b7c3a0585db18e) reqSize 8 respSize 0 respFd 157 opId 0x9666afda57f1549e - DONEnode201:709396:712324 [4] NCCL INFO UDS: ClientGetFd handle 0x7fa4f4082750 tpRank 0 returned fd 157node201:709396:712145 [4] NCCL INFO proxyProgressAsync 
opId=0x7fdd74007c20 op.type=4 op.reqBuff=0x7fe3580af820 op.respSize=0 donenode201:709396:712145 [4] NCCL INFO Received and initiated operation=Connect res=0node201:709396:712324 [4] NCCL INFO ncclPollProxyResponse Received new opId=0x7fdd74007c20node201:709396:712324 [4] NCCL INFO resp.opId=0x7fdd74007c20 matches expected opId=0x7fdd74007c20node201:709397:712138 [5] NCCL INFO proxyProgressAsync opId=0x7f9c24007c20 op.type=4 op.reqBuff=0x7fa2000af820 op.respSize=0 donenode201:709397:712138 [5] NCCL INFO Received and initiated operation=Connect res=0node201:709397:712325 [5] NCCL INFO ncclPollProxyResponse Received new opId=0x7f9c24007c20node201:709397:712325 [5] NCCL INFO resp.opId=0x7f9c24007c20 matches expected opId=0x7f9c24007c20node201:709392:712151 [0] NCCL INFO proxyProgressAsync opId=0x7f9ea0007c20 op.type=3 op.reqBuff=0x7fa4f4084410 op.respSize=80 donenode201:709392:712320 [0] NCCL INFO ncclPollProxyResponse Received new opId=0x7f9ea0007c20node201:709392:712151 [0] NCCL INFO Received and initiated operation=Setup res=0node201:709392:712320 [0] NCCL INFO resp.opId=0x7f9ea0007c20 matches expected opId=0x7f9ea0007c20node201:709395:712323 [3] NCCL INFO ProxyCall UDS comm 0x55e0e4bdc1e0 rank 3 tpRank 0(64b7c3a0585db18e) reqSize 8 respSize 0 respFd 0x7fe259ffebe0 opId 0x72d942102eb468enode201:709392:712152 [0] NCCL INFO proxyUDSRecvReq::ncclProxyMsgGetFd rank 3 opId 0x72d942102eb468e handle=0x7fa4f4084430node201:709392:712152 [0] NCCL INFO UDS proxyGetFd received handle 0x7fa4f4084430 peer 3 opId 72d942102eb468enode201:709392:712151 [0] NCCL INFO New proxy recv connection 78 from local rank 0, transport 0node201:709392:712151 [0] NCCL INFO proxyProgressAsync opId=0x7f9ea0007c20 op.type=1 op.reqBuff=0x7fa4f4084410 op.respSize=16 donenode201:709392:712320 [0] NCCL INFO ncclPollProxyResponse Received new opId=0x7f9ea0007c20node201:709392:712151 [0] NCCL INFO Received and initiated operation=Init res=0node201:709392:712320 [0] NCCL INFO resp.opId=0x7f9ea0007c20 
matches expected opId=0x7f9ea0007c20node201:709392:712320 [0] NCCL INFO Connected to proxy localRank 0 -> connection 0x7fa4f4007b20node201:709395:712323 [3] NCCL INFO ProxyCall UDS comm 0x55e0e4bdc1e0 rank 3 tpRank 0(64b7c3a0585db18e) reqSize 8 respSize 0 respFd 157 opId 0x72d942102eb468e - DONEnode201:709395:712323 [3] NCCL INFO UDS: ClientGetFd handle 0x7fa4f4084430 tpRank 0 returned fd 157node201:709392:712151 [0] NCCL INFO proxyProgressAsync opId=0x7f9ea0007c20 op.type=3 op.reqBuff=0x7fa4f40860f0 op.respSize=80 donenode201:709392:712320 [0] NCCL INFO ncclPollProxyResponse Received new opId=0x7f9ea0007c20node201:709392:712151 [0] NCCL INFO Received and initiated operation=Setup res=0node201:709392:712320 [0] NCCL INFO resp.opId=0x7f9ea0007c20 matches expected opId=0x7f9ea0007c20node201:709392:712151 [0] NCCL INFO New proxy recv connection 79 from local rank 0, transport 0node201:709392:712151 [0] NCCL INFO proxyProgressAsync opId=0x7f9ea0007c20 op.type=1 op.reqBuff=0x7fa4f40860f0 op.respSize=16 donenode201:709392:712320 [0] NCCL INFO ncclPollProxyResponse Received new opId=0x7f9ea0007c20node201:709392:712320 [0] NCCL INFO resp.opId=0x7f9ea0007c20 matches expected opId=0x7f9ea0007c20node201:709394:712322 [2] NCCL INFO ProxyCall UDS comm 0x562af065c1b0 rank 2 tpRank 0(64b7c3a0585db18e) reqSize 8 respSize 0 respFd 0x7f1f75ffebe0 opId 0xaecc8df2a08df09anode201:709392:712320 [0] NCCL INFO Connected to proxy localRank 0 -> connection 0x7fa4f4007bb0node201:709392:712152 [0] NCCL INFO proxyUDSRecvReq::ncclProxyMsgGetFd rank 2 opId 0xaecc8df2a08df09a handle=0x7fa4f4086110node201:709392:712152 [0] NCCL INFO UDS proxyGetFd received handle 0x7fa4f4086110 peer 2 opId aecc8df2a08df09anode201:709392:712151 [0] NCCL INFO Received and initiated operation=Init res=0node201:709392:712151 [0] NCCL INFO proxyProgressAsync opId=0x7f9ea0007c20 op.type=3 op.reqBuff=0x7fa4f4087dd0 op.respSize=80 donenode201:709392:712320 [0] NCCL INFO ncclPollProxyResponse Received new 
opId=0x7f9ea0007c20node201:709392:712151 [0] NCCL INFO Received and initiated operation=Setup res=0node201:709392:712320 [0] NCCL INFO resp.opId=0x7f9ea0007c20 matches expected opId=0x7f9ea0007c20node201:709393:712321 [1] NCCL INFO ProxyCall UDS comm 0x557902a6c560 rank 1 tpRank 0(64b7c3a0585db18e) reqSize 8 respSize 0 respFd 0x7efbe5ffebe0 opId 0x7098875de36c2benode201:709394:712322 [2] NCCL INFO ProxyCall UDS comm 0x562af065c1b0 rank 2 tpRank 0(64b7c3a0585db18e) reqSize 8 respSize 0 respFd 157 opId 0xaecc8df2a08df09a - DONEnode201:709394:712322 [2] NCCL INFO UDS: ClientGetFd handle 0x7fa4f4086110 tpRank 0 returned fd 157node201:709392:712152 [0] NCCL INFO proxyUDSRecvReq::ncclProxyMsgGetFd rank 1 opId 0x7098875de36c2be handle=0x7fa4f4087df0node201:709392:712152 [0] NCCL INFO UDS proxyGetFd received handle 0x7fa4f4087df0 peer 1 opId 7098875de36c2benode201:709395:712137 [3] NCCL INFO proxyProgressAsync opId=0x7fe250007c20 op.type=4 op.reqBuff=0x7fe83c0af820 op.respSize=0 donenode201:709395:712137 [3] NCCL INFO Received and initiated operation=Connect res=0node201:709395:712323 [3] NCCL INFO ncclPollProxyResponse Received new opId=0x7fe250007c20node201:709395:712323 [3] NCCL INFO resp.opId=0x7fe250007c20 matches expected opId=0x7fe250007c20node201:709394:712148 [2] NCCL INFO proxyProgressAsync opId=0x7f1f6c007c20 op.type=4 op.reqBuff=0x7f254c0af820 op.respSize=0 donenode201:709394:712148 [2] NCCL INFO Received and initiated operation=Connect res=0node201:709394:712322 [2] NCCL INFO ncclPollProxyResponse Received new opId=0x7f1f6c007c20node201:709394:712322 [2] NCCL INFO resp.opId=0x7f1f6c007c20 matches expected opId=0x7f1f6c007c20node201:709393:712321 [1] NCCL INFO ProxyCall UDS comm 0x557902a6c560 rank 1 tpRank 0(64b7c3a0585db18e) reqSize 8 respSize 0 respFd 157 opId 0x7098875de36c2be - DONEnode201:709393:712321 [1] NCCL INFO UDS: ClientGetFd handle 0x7fa4f4087df0 tpRank 0 returned fd 157node201:709393:712147 [1] NCCL INFO proxyProgressAsync opId=0x7efbdc007c20 
op.type=4 op.reqBuff=0x7f01c40af820 op.respSize=0 donenode201:709393:712147 [1] NCCL INFO Received and initiated operation=Connect res=0node201:709393:712321 [1] NCCL INFO ncclPollProxyResponse Received new opId=0x7efbdc007c20node201:709393:712321 [1] NCCL INFO resp.opId=0x7efbdc007c20 matches expected opId=0x7efbdc007c20