1. 制作 nccl-test 镜像
1
2
3
| nvidia-smi | grep "CUDA Version" | awk '{print $9}'
12.2
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
| cat > Dockerfile << EOF
FROM hubimage/nvidia-cuda:12.1.0-cudnn8-devel-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive
ARG CONDA_VERSION
WORKDIR /workspace
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt install -y openmpi-bin libopenmpi-dev ssh openssh-server net-tools vim git iputils-ping nfs-common
RUN git clone https://github.com/NVIDIA/nccl-tests.git && \
cd nccl-tests && \
make MPI=1 MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi
EOF
|
1
| docker build -t hubimage/nccl-test:12.1.0-ubuntu22.04 -f Dockerfile .
|
1
| docker push hubimage/nccl-test:12.1.0-ubuntu22.04
|
2. 运行 Volcano Job
1
2
3
| kubectl label node node-a100-15 nccl-test=true
kubectl label node node-a100-16 nccl-test=true
kubectl label node node-a100-31 nccl-test=true
|
1
2
| export NCCL_TEST_IMAGE=hubimage/nccl-test:12.1.0-ubuntu22.04
export NCCL_TEST_NODES=3
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
| cat <<EOF | kubectl apply -f -
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
name: nccl-test
spec:
minAvailable: $NCCL_TEST_NODES
schedulerName: volcano
queue: default
policies:
- event: PodEvicted
action: RestartJob
plugins:
env: []
svc: []
ssh: []
tasks:
- replicas: $NCCL_TEST_NODES
name: nccl-test
policies:
- event: TaskCompleted
action: CompleteJob
template:
metadata:
labels:
app: nccl-test
spec:
containers:
- command:
- /bin/sh
- -c
- |
mkdir -p /var/run/sshd; /usr/sbin/sshd;
sleep infinity
image: $NCCL_TEST_IMAGE
imagePullPolicy: Always
name: nccl-test
resources:
requests:
cpu: 1
nodeSelector:
nccl-test: "true"
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: app
operator: In
values:
- nccl-test
topologyKey: "kubernetes.io/hostname"
EOF
|
3. 运行 nccl-test
1
2
3
4
5
6
| kubectl get pod -l app=nccl-test -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
nccl-test-nccl-test-0 1/1 Running 0 7m3s 10.244.39.166 node-a100-31 <none> <none>
nccl-test-nccl-test-1 1/1 Running 0 7m3s 10.244.201.46 node-a100-15 <none> <none>
nccl-test-nccl-test-2 1/1 Running 0 7m 10.244.39.185 node-a100-31 <none> <none>
|
1
| kubectl exec -it nccl-test-nccl-test-0 bash
|
1
| export NCCL_TEST_GPUS=8
|
1
2
3
4
5
6
7
8
9
10
11
12
| mpirun --allow-run-as-root -np $VC_NCCL_TEST_NUM --host $VC_NCCL_TEST_HOSTS /workspace/nccl-tests/build/all_reduce_perf -b 8M -e 128M -f 2 -g ${NCCL_TEST_GPUS} -t 1 -a 2 -n 50
# out-of-place in-place
# size count type redop root time algbw busbw #wrong time algbw busbw #wrong
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
8388608 2097152 float sum -1 103954 0.08 0.15 0 113322 0.07 0.14 0
16777216 4194304 float sum -1 135988 0.12 0.24 0 108949 0.15 0.30 0
33554432 8388608 float sum -1 117689 0.29 0.55 0 122374 0.27 0.53 0
67108864 16777216 float sum -1 227753 0.29 0.56 0 227060 0.30 0.57 0
134217728 33554432 float sum -1 479106 0.28 0.54 0 476898 0.28 0.54 0
# Out of bounds values : 0 OK
# Avg bus bandwidth : 0.410777
|
mpirun 命令参数说明:
--allow-run-as-root
: 允许使用 root 运行程序
-np
: 执行的总进程数量,这里一个节点一个进程,因此就是节点的数量
--host
: 运行程序的节点列表
all_reduce_perf 参数说明:
-b
: 开始的数据大小
-e
: 结束的数据大小
-f
: 每次增加的倍数
-g
: 每个线程的 GPU 数量(这里 -t 为 1,因此也就是每个进程的 GPU 数量)
-t
: 每个进程的线程数量
-a
: 指定如何汇总所有 ranks 的性能数据作为最终结果 (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. 默认为 1,即取平均值;上面的命令使用 -a 2,即取最小值。
-n
: 每次操作(一次发送)循环多少次
此外,还可以通过 mpirun 的 -x 选项将环境变量传递给各个进程,例如 -x NCCL_SOCKET_IFNAME=eth0 -x NCCL_P2P_LEVEL=NVL -x LD_LIBRARY_PATH -x PATH -x NCCL_NET_GDR_LEVEL=4
等通信参数,用于测试 NVLink、IB 网卡的带宽。
4. 清理环境
1
| kubectl delete job.batch.volcano.sh nccl-test
|
5. 相关问题与解决
报错
1
2
3
| nccl-test-nccl-test-1: Test CUDA failure common.cu:622 'invalid device ordinal'
.. nccl-test-nccl-test-1 pid 717: Test failure common.cu:1078
.. nccl-test-nccl-test-1 pid 717: Test failure common.cu:891
|
该节点上可用的 GPU 卡数量少于 -g 参数指定的数量,需要减小 -g 参数的值。