Please enable Javascript to view the contents

使用 Volcano 运行 hccl-test

 ·  ☕ 2 分钟

1. 制作 hccl-test 镜像

  • 下载依赖包

Python-3.8.18.tgz
Ascend-cann-toolkit_8.0.RC2_linux-x86_64.run
Ascend-cann-kernels-910b_8.0.RC2_linux.run
mpich-3.2.1.tar.gz

如果不方便下载，也可以直接从我打包的镜像中拷贝出来。

  • 编写 Dockerfile
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# Image for running the Ascend hccl_test tools:
# Ubuntu 22.04 + Python 3.8.18 + CANN toolkit / 910b kernels 8.x + MPICH 3.2.1.
# The build context must contain Python-3.8.18.tgz, mpich-3.2.1.tar.gz and the
# Ascend-cann-toolkit_8.*.run / Ascend-cann-kernels-910b_8.*.run installers.
FROM ubuntu:22.04

WORKDIR /home

# Comment out the APT/DPkg directives in the base image's docker-clean config.
RUN sed -i -e 's/^APT/# APT/' -e 's/^DPkg/# DPkg/' /etc/apt/apt.conf.d/docker-clean

# Install base tools and libraries.
# `apt-get update` shares a layer with `apt-get install` so a cached, stale
# package index is never paired with a fresh install step, and
# DEBIAN_FRONTEND=noninteractive keeps tzdata from prompting for a time zone.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        apt-transport-https tzdata openssh-server \
        gcc g++ make cmake zlib1g zlib1g-dev openssl libsqlite3-dev libssl-dev \
        libffi-dev libbz2-dev libxslt1-dev unzip pciutils net-tools libblas-dev \
        gfortran libblas3 liblzma-dev git wget zip vim gnutls-bin

# Install Python3.8.18 from source into /usr/local/python3.8.18
COPY Python-3.8.18.tgz .
RUN tar -zxvf Python-3.8.18.tgz \
    && cd Python-3.8.18 \
    && chmod +x ./configure
RUN cd Python-3.8.18 && ./configure --prefix=/usr/local/python3.8.18 --enable-loadable-sqlite-extensions --enable-shared  --enable-optimizations
# Use every core the build host offers instead of a hard-coded job count.
RUN cd Python-3.8.18 && make -j"$(nproc)" && make install
RUN cd /home \
    && ln -s /usr/local/python3.8.18/bin/pip3 /usr/bin/pip \
    && ln -s /usr/local/python3.8.18/bin/python3 /usr/bin/python
# --enable-shared builds libpython as a shared library; make it loadable.
ENV LD_LIBRARY_PATH=/usr/local/python3.8.18/lib:$LD_LIBRARY_PATH
ENV PATH=/usr/local/python3.8.18/bin:$PATH

# Install ascend-toolkit & kernels
COPY Ascend-cann-toolkit_8.*.run .
COPY Ascend-cann-kernels-910b_8.*.run .
RUN ./Ascend-cann-toolkit_8.*.run --install --quiet
RUN ./Ascend-cann-kernels-910b_8.*.run --install --quiet
RUN echo "source /usr/local/Ascend/ascend-toolkit/set_env.sh" >> ~/.bashrc

# Set Env
ENV TOOLKIT_PATH=/usr/local/Ascend/ascend-toolkit/latest
# NOTE(review): /usr/local/Ascend/driver is not installed by this Dockerfile;
# presumably it is mounted from the host at run time - confirm.
ENV LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver/:$LD_LIBRARY_PATH

ENV GLOG_v=2 \
    LD_LIBRARY_PATH=$TOOLKIT_PATH/lib64:$LD_LIBRARY_PATH \
    TBE_IMPL_PATH=$TOOLKIT_PATH/opp/op_impl/built-in/ai_core/tbe \
    PATH=$TOOLKIT_PATH/ccec_compiler/bin:$PATH \
    ASCEND_OPP_PATH=$TOOLKIT_PATH/opp \
    ASCEND_AICPU_PATH=$TOOLKIT_PATH

ENV PYTHONPATH=$TBE_IMPL_PATH:$PYTHONPATH
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/mpich-3.2.1/lib:$TOOLKIT_PATH/lib64:$TOOLKIT_PATH/x86_64-linux/devlib

# Install hccl_test: build MPICH into /usr/local/mpich-3.2.1, then compile the
# hccl_test tools shipped with the toolkit against it.
COPY mpich-3.2.1.tar.gz .
RUN umask 0022 && \
    tar -zxvf mpich-3.2.1.tar.gz && \
    cd mpich-3.2.1 && \
    ./configure --disable-fortran --prefix=/usr/local/mpich-3.2.1 && \
    make && \
    make install && \
    cd $TOOLKIT_PATH/tools/hccl_test && \
    make MPI_HOME=/usr/local/mpich-3.2.1 ASCEND_DIR=$TOOLKIT_PATH/x86_64-linux

# Put mpirun and the hccl_test binaries (all_reduce_test, ...) on PATH.
ENV PATH=$PATH:/usr/local/mpich-3.2.1/bin:$TOOLKIT_PATH/tools/hccl_test/bin
  • 构建 hccl-test 镜像
1
# Build the image from ./Dockerfile with the current directory as build context.
docker build --file Dockerfile --tag hubimage/hccl-test:8.0.RC2-ubuntu22.04 .
  • 推送 hccl-test 镜像
1
# Publish the tagged image so the cluster nodes can pull it.
docker image push hubimage/hccl-test:8.0.RC2-ubuntu22.04

2. 运行 Volcano Job

  • 给测试节点打上标签
1
2
3
# Mark each test node with hccl-test=true (the label the Job's nodeSelector uses).
for node in ascend-910b-01 ascend-910b-02 ascend-910b-03; do
  kubectl label node "$node" hccl-test=true
done
  • 创建 Volcano Job
1
2
# Image and node count that get substituted into the Volcano Job manifest.
HCCL_TEST_IMAGE=hubimage/hccl-test:8.0.RC2-ubuntu22.04
HCCL_TEST_NODES=3
export HCCL_TEST_IMAGE HCCL_TEST_NODES
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# Create the Volcano Job: HCCL_TEST_NODES pods, at most one per node (pod
# anti-affinity on hostname), each requesting 16 Ascend910 devices. Every pod
# starts sshd and then idles so the tests can be launched by hand.
# The here-document is intentionally unquoted so the shell substitutes
# HCCL_TEST_NODES and HCCL_TEST_IMAGE; the guard below refuses to submit a
# manifest with empty fields when either variable is missing.
# NOTE(review): priorityClassName is not a Kubernetes built-in class; presumably
# it must already exist in the cluster - confirm.
: "${HCCL_TEST_NODES:?export HCCL_TEST_NODES (number of test nodes) first}" \
  "${HCCL_TEST_IMAGE:?export HCCL_TEST_IMAGE (hccl-test image) first}" &&
cat <<EOF | kubectl apply -f -
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: hccl-test
spec:
  minAvailable: $HCCL_TEST_NODES
  schedulerName: volcano
  queue: default
  priorityClassName: extreme-high-priority-preempting
  policies:
    - event: PodEvicted
      action: RestartJob
  plugins:
    env: []
    svc: []
    ssh: []
  tasks:
    - replicas: $HCCL_TEST_NODES
      name: hccl-test
      policies:
      - event: TaskCompleted
        action: CompleteJob
      template:
        metadata:
          labels:
            app: hccl-test
        spec:
          containers:
            - command:
              - /bin/sh
              - -c
              - |
                mkdir -p /var/run/sshd; /usr/sbin/sshd;
                sleep infinity
              image: $HCCL_TEST_IMAGE
              imagePullPolicy: Always
              name: hccl-test
              resources:
                requests:
                  cpu: 1
                  huawei.com/Ascend910: 16
                limits:
                  huawei.com/Ascend910: 16
          nodeSelector:
            hccl-test: "true"
          affinity:
            podAntiAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                - labelSelector:
                    matchExpressions:
                      - key: app
                        operator: In
                        values:
                        - hccl-test
                  topologyKey: "kubernetes.io/hostname"
EOF

3. 运行 hccl-test

  • 查看 Pod 状态
1
2
3
4
5
6
# List the Job's pods together with their IPs and the nodes they landed on.
kubectl get pods --selector app=hccl-test --output wide

NAME                    READY   STATUS    RESTARTS   AGE     IP              NODE             NOMINATED NODE   READINESS GATES
hccl-test-hccl-test-0   1/1     Running   0          7m19s   10.244.53.163   ascend-910b-02   <none>           <none>
hccl-test-hccl-test-1   1/1     Running   0          7m19s   10.244.54.38    ascend-910b-01   <none>           <none>
hccl-test-hccl-test-2   1/1     Running   0          7m19s   10.244.52.160   ascend-910b-03   <none>           <none>
  • 进入 Pod
1
# Open an interactive shell in the first pod. The `--` separates kubectl's own
# flags from the command to run; the form without it is deprecated.
kubectl exec -it hccl-test-hccl-test-0 -- bash
  • 单机测试
1
2
3
4
# Single-node all_reduce run: 16 MPI ranks on this pod, fp32 sum.
# NOTE(review): -b/-e/-f presumably set the min size, max size and step factor
# of the message sweep, and -p the NPUs used per node - confirm against the
# hccl_test documentation.
export HCCL_RDMA_TC=100 HCCL_RDMA_SL=3 HCCL_BUFFSIZE=2048
mpirun -n 16 \
  /usr/local/Ascend/ascend-toolkit/latest/tools/hccl_test/bin/all_reduce_test \
  -b 8K -e 64M -f 2 -d fp32 -o sum -p 16
  • 多节点测试
1
2
3
4
5
# Multi-node all_reduce run across the three pods of the Job.
# Every pod is listed exactly once in -hosts with 16 slots (one per NPU), and
# -n is the total rank count: 3 pods x 16 = 48. Adjust both when the number of
# nodes changes.
export HCCL_RDMA_TC=100
export HCCL_RDMA_SL=3
export HCCL_BUFFSIZE=2048

mpirun -n 48 -hosts hccl-test-hccl-test-0.hccl-test:16,hccl-test-hccl-test-1.hccl-test:16,hccl-test-hccl-test-2.hccl-test:16 all_reduce_test -b 8K -e 4G -f 2 -d fp32 -o sum -p 16 -c 0

4. 清理环境

1
# Remove the Volcano Job named hccl-test.
kubectl delete job.batch.volcano.sh/hccl-test

微信公众号
作者
微信公众号