0. Info

https://github.com/sii-research/VCCL/pull/51

1. Usage

build

step
# Download VCCL, switch to the branch of pull request #51, and compile it.
git clone https://github.com/sii-research/VCCL.git
cd VCCL
# Fetch the head of PR #51 into a local branch named pr-51, then check it out.
git fetch origin pull/51/head:pr-51
git checkout pr-51
# Build the src.build target using up to 64 parallel make jobs.
make -j64 src.build
 
# Download and compile vccl-tests.
# NOTE(review): the earlier `cd VCCL` is still in effect at this point, so the
# clone below lands inside the VCCL checkout unless you change directory first.
git clone https://github.com/leoda1/vccl-tests.git
cd vccl-tests
# /path/to/VCCL/build/ is a placeholder: replace it with the build directory
# of the VCCL checkout compiled above. MPI_HOME is the MPI installation prefix.
# MPI=1 presumably enables the MPI-aware build of the tests - confirm in the
# vccl-tests Makefile.
make -j64 MPI=1 MPI_HOME=/usr/local/mpi NCCL_HOME=/path/to/VCCL/build/

test

测试 alltoallv 需要下载上面所述的 vccl-tests；测试 sendrecv 及其他操作时，直接使用 nccl-tests 也可以；测试 nccl4py 的功能只需参考 nccl4py。

# send/recv benchmark: 4 ranks in total, 2 slots on each of the two hosts.
# Run this from the tests checkout so that ./build/sendrecv_perf resolves.
# Adjust the host addresses and the library path to your environment; the
# path must be the lib directory of the VCCL build.
#
# Open MPI spells the host-list option --host (or -H); --hosts is rejected.
# The ${LD_LIBRARY_PATH:+...} form appends the existing value only when it is
# non-empty, so no trailing ':' (which the dynamic loader treats as the
# current directory) is exported.
# The size flags follow nccl-tests conventions: -b/-e are the minimum and
# maximum message sizes and -f is the multiplication factor between sizes.
mpirun -np 4 \
  --host 10.0.26.138:2,10.0.26.202:2 \
  -x LD_LIBRARY_PATH="/workspace/liuda/VCCL/build/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" \
  -x NCCL_DEBUG=version \
  -x NCCL_PSM_FORCE_ZEROCOPY=1 \
  -x NCCL_PASS_SM=1 \
  -x NCCL_PXN_DISABLE=1 \
  ./build/sendrecv_perf -b 1kB -e 8GB -f 2 -R 1 -g 1
  
# alltoallv benchmark: 32 ranks placed according to ./hostfile.
# Run this from the vccl-tests checkout so that ./build/alltoallv_perf
# resolves, and adjust the library path to the lib directory of your VCCL
# build.
#
# The ${LD_LIBRARY_PATH:+...} form appends the existing value only when it is
# non-empty, so no trailing ':' (which the dynamic loader treats as the
# current directory) is exported.
# '^ucc' is quoted so the caret always reaches Open MPI literally; it excludes
# the ucc collective component.
# NCCL_IB_HCA: the second '=' is part of the value on purpose. A leading '='
# asks NCCL to match the listed device:port names exactly instead of by prefix.
# NOTE(review): the BTL list contains only self and vader; confirm that the
# MPI bootstrap traffic can reach every host in the hostfile with this setting.
mpirun -np 32 \
  --allow-run-as-root \
  --hostfile hostfile \
  --mca pml ob1 \
  --mca btl self,vader \
  -x OMPI_MCA_coll='^ucc' \
  -x LD_LIBRARY_PATH="/inspire/hdd/global_user/huxiaohe-p-huxiaohe/liuda/a2av/build/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" \
  -x NCCL_IB_HCA=="mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1" \
  -x NCCL_DEBUG=version \
  -x CUDA_DEVICE_MAX_CONNECTIONS=32 \
  ./build/alltoallv_perf -b 1KB -e 8GB -f 2 -R 2 -B 1 -g 1