1. 制作 hccl-test 镜像
Python-3.8.18.tgz
Ascend-cann-toolkit_8.0.RC2_linux-x86_64.run
Ascend-cann-kernels-910b_8.0.RC2_linux.run
mpich-3.2.1.tar.gz
如果不方便下载上述安装包，也可以直接从我打包好的镜像中拷贝出来。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
| FROM ubuntu:22.04
WORKDIR /home
# Comment out the lines in docker-clean that start with "APT" / "DPkg"
# (presumably so apt keeps its downloaded package cache inside the image).
RUN sed -i -e 's/^APT/# APT/' -e 's/^DPkg/# DPkg/' /etc/apt/apt.conf.d/docker-clean
# Install tooling and base libraries.
# "apt-get update" and "apt-get install" run in the SAME layer so a cached,
# stale package index can never be paired with a newer install command.
# DEBIAN_FRONTEND=noninteractive keeps tzdata from prompting for a region
# during the build; it is set only for this command, not baked into the image.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
    apt-transport-https tzdata openssh-server \
    gcc g++ make cmake zlib1g zlib1g-dev openssl libsqlite3-dev libssl-dev \
    libffi-dev libbz2-dev libxslt1-dev unzip pciutils net-tools libblas-dev \
    gfortran libblas3 liblzma-dev git wget zip vim gnutls-bin
# Install Python 3.8.18, built from the source tarball in the build context.
COPY Python-3.8.18.tgz .
# Unpack the sources and make sure the configure script is executable.
RUN tar -zxvf Python-3.8.18.tgz \
&& cd Python-3.8.18 \
&& chmod +x ./configure
# Configure a shared-library build installed under /usr/local/python3.8.18.
RUN cd Python-3.8.18 && ./configure --prefix=/usr/local/python3.8.18 --enable-loadable-sqlite-extensions --enable-shared --enable-optimizations
# Compile with 16 parallel jobs, then install into the prefix above.
RUN cd Python-3.8.18 && make -j16 && make install
# Expose the new interpreter and pip as /usr/bin/python and /usr/bin/pip.
RUN cd /home \
&& ln -s /usr/local/python3.8.18/bin/pip3 /usr/bin/pip \
&& ln -s /usr/local/python3.8.18/bin/python3 /usr/bin/python
# --enable-shared puts libpython under the prefix's lib/ directory, so that
# directory must be on the dynamic loader path for the interpreter to start.
ENV LD_LIBRARY_PATH=/usr/local/python3.8.18/lib:$LD_LIBRARY_PATH
ENV PATH=/usr/local/python3.8.18/bin:$PATH
# Install the Ascend CANN toolkit and the 910B kernels package.
# The globs accept any 8.x installer present in the build context.
COPY Ascend-cann-toolkit_8.*.run .
COPY Ascend-cann-kernels-910b_8.*.run .
# Run both self-extracting installers non-interactively.
RUN ./Ascend-cann-toolkit_8.*.run --install --quiet
RUN ./Ascend-cann-kernels-910b_8.*.run --install --quiet
# Make interactive bash sessions of the build user source the toolkit environment.
RUN echo "source /usr/local/Ascend/ascend-toolkit/set_env.sh" >> ~/.bashrc
# Set Env
# TOOLKIT_PATH is defined in its own instruction so the ENV instructions below
# can reference it.
ENV TOOLKIT_PATH=/usr/local/Ascend/ascend-toolkit/latest
# Driver libraries live outside the toolkit tree
# (NOTE(review): presumably mounted from the host at runtime - confirm).
ENV LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver/:$LD_LIBRARY_PATH
# Toolkit library, operator and compiler paths, plus GLOG_v=2.
ENV GLOG_v=2 \
LD_LIBRARY_PATH=$TOOLKIT_PATH/lib64:$LD_LIBRARY_PATH \
TBE_IMPL_PATH=$TOOLKIT_PATH/opp/op_impl/built-in/ai_core/tbe \
PATH=$TOOLKIT_PATH/ccec_compiler/bin:$PATH \
ASCEND_OPP_PATH=$TOOLKIT_PATH/opp \
ASCEND_AICPU_PATH=$TOOLKIT_PATH
# Separate instruction: TBE_IMPL_PATH is only visible to instructions that
# follow the ENV that defined it.
ENV PYTHONPATH=$TBE_IMPL_PATH:$PYTHONPATH
# Append the mpich library directory (mpich is installed to this prefix later
# in the build) and the toolkit lib64 / devlib directories.
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/mpich-3.2.1/lib:$TOOLKIT_PATH/lib64:$TOOLKIT_PATH/x86_64-linux/devlib
# Install hccl_test: build mpich 3.2.1 from source, then compile the hccl_test
# tools shipped inside the toolkit against it.
COPY mpich-3.2.1.tar.gz .
# One layer: unpack, configure mpich without Fortran bindings under
# /usr/local/mpich-3.2.1, build and install it, then run make in the toolkit's
# tools/hccl_test directory with MPI_HOME pointing at that prefix and
# ASCEND_DIR at the toolkit's x86_64-linux tree.
# umask 0022 applies to every file created by these steps.
RUN umask 0022 && \
tar -zxvf mpich-3.2.1.tar.gz && \
cd mpich-3.2.1 && \
./configure --disable-fortran --prefix=/usr/local/mpich-3.2.1 && \
make && \
make install && \
cd $TOOLKIT_PATH/tools/hccl_test && \
make MPI_HOME=/usr/local/mpich-3.2.1 ASCEND_DIR=$TOOLKIT_PATH/x86_64-linux
# Put the mpich binaries and the hccl_test binaries on PATH.
ENV PATH=$PATH:/usr/local/mpich-3.2.1/bin:$TOOLKIT_PATH/tools/hccl_test/bin
|
1
| docker build -t hubimage/hccl-test:8.0.RC2-ubuntu22.04 -f Dockerfile .
|
1
| docker push hubimage/hccl-test:8.0.RC2-ubuntu22.04
|
2. 运行 Volcano Job
1
2
3
| kubectl label node ascend-910b-01 hccl-test=true
kubectl label node ascend-910b-02 hccl-test=true
kubectl label node ascend-910b-03 hccl-test=true
|
1
2
| export HCCL_TEST_IMAGE=hubimage/hccl-test:8.0.RC2-ubuntu22.04
export HCCL_TEST_NODES=3
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
| cat <<EOF | kubectl apply -f -
# Volcano Job that starts one long-running pod per labelled node; each pod
# requests 16 Ascend910 devices and runs sshd so mpirun can reach it.
# HCCL_TEST_NODES and HCCL_TEST_IMAGE are substituted by the shell before
# kubectl receives this manifest.
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: hccl-test
spec:
  # Gang scheduling: start only when every pod can be placed.
  minAvailable: $HCCL_TEST_NODES
  schedulerName: volcano
  queue: default
  priorityClassName: extreme-high-priority-preempting
  policies:
    - event: PodEvicted
      action: RestartJob
  # Volcano job plugins (see the Volcano documentation for what each one
  # injects: environment variables, a headless service, ssh keys).
  plugins:
    env: []
    svc: []
    ssh: []
  tasks:
    - replicas: $HCCL_TEST_NODES
      name: hccl-test
      policies:
        - event: TaskCompleted
          action: CompleteJob
      template:
        metadata:
          labels:
            app: hccl-test
        spec:
          containers:
            - command:
                - /bin/sh
                - -c
                - |
                  mkdir -p /var/run/sshd; /usr/sbin/sshd;
                  sleep infinity
              image: $HCCL_TEST_IMAGE
              imagePullPolicy: Always
              name: hccl-test
              resources:
                requests:
                  cpu: 1
                  huawei.com/Ascend910: 16
                limits:
                  huawei.com/Ascend910: 16
          # Only nodes labelled hccl-test=true are eligible.
          nodeSelector:
            hccl-test: "true"
          # At most one hccl-test pod per node.
          affinity:
            podAntiAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                - labelSelector:
                    matchExpressions:
                      - key: app
                        operator: In
                        values:
                          - hccl-test
                  topologyKey: "kubernetes.io/hostname"
EOF
|
3. 运行 hccl-test
1
2
3
4
5
6
| kubectl get pod -l app=hccl-test -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
hccl-test-hccl-test-0 1/1 Running 0 7m19s 10.244.53.163 ascend-910b-02 <none> <none>
hccl-test-hccl-test-1 1/1 Running 0 7m19s 10.244.54.38 ascend-910b-01 <none> <none>
hccl-test-hccl-test-2 1/1 Running 0 7m19s 10.244.52.160 ascend-910b-03 <none> <none>
|
1
| kubectl exec -it hccl-test-hccl-test-0 -- bash  # "--" separates kubectl's own flags from the command run in the pod; the form without it is deprecated
|
1
2
3
4
| export HCCL_RDMA_TC=100
export HCCL_RDMA_SL=3
export HCCL_BUFFSIZE=2048
mpirun -n 16 /usr/local/Ascend/ascend-toolkit/latest/tools/hccl_test/bin/all_reduce_test -b 8K -e 64M -f 2 -d fp32 -o sum -p 16
|
1
2
3
4
5
| export HCCL_RDMA_TC=100
export HCCL_RDMA_SL=3
export HCCL_BUFFSIZE=2048
# Multi-node all-reduce: list each pod exactly once with 16 ranks per pod.
# 3 pods x 16 NPUs = 48 ranks, so -n must be 48; repeating pod 0 would place
# every rank on a single node instead of exercising the inter-node links.
mpirun -n 48 -hosts hccl-test-hccl-test-0.hccl-test:16,hccl-test-hccl-test-1.hccl-test:16,hccl-test-hccl-test-2.hccl-test:16 all_reduce_test -b 8K -e 4G -f 2 -d fp32 -o sum -p 16 -c 0
|
4. 清理环境
1
| kubectl delete job.batch.volcano.sh hccl-test
|