Please enable Javascript to view the contents

Ascend NPU 驱动安装

 ·  ☕ 2 分钟

1. 安装驱动

  • 创建 HwHiAiUser 用户
1
2
groupadd -g 1000 HwHiAiUser
useradd -g HwHiAiUser -u 1000 -d /home/HwHiAiUser -m HwHiAiUser -s /bin/bash
  • 添加目录权限
1
2
chown -R HwHiAiUser /usr/local/Ascend
chmod -R 755 /usr/local/Ascend
  • 下载驱动、固件

前往 https://www.hiascend.com/hardware/firmware-drivers/community?product=1&model=30&cann=All&driver=1.0.26.alpha 找到对应的驱动和固件。

1
wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Ascend%20HDK/Ascend%20HDK%2024.1.RC2.2/Ascend-hdk-910b-npu-driver_24.1.rc2.2_linux-x86-64.run
1
wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Ascend%20HDK/Ascend%20HDK%2024.1.RC2.2/Ascend-hdk-910b-npu-firmware_7.3.0.2.220.run
  • 安装驱动
1
bash ./Ascend-hdk-910b-npu-driver_24.1.rc2.2_linux-x86-64.run --full --install-for-all
  • 安装固件
1
bash ./Ascend-hdk-910b-npu-firmware_7.3.0.2.220.run --full

2. 安装 docker runtime

2.1 安装 docker

  • 添加 key
1
2
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - &&
curl -fsSL https://mirrors.aliyun.com/docker-ce/linux/ubuntu/gpg | sudo apt-key add -
  • 添加源
1
2
3
4
# Write (not append) the Docker apt sources so that re-running this step does
# not duplicate the entries; both the Aliyun mirror and the upstream repo are listed.
cat > /etc/apt/sources.list.d/docker.list << EOF
deb [arch=amd64] https://mirrors.aliyun.com/docker-ce/linux/ubuntu xenial stable
deb [arch=amd64] https://download.docker.com/linux/ubuntu xenial stable
EOF
  • 更新源
1
apt-get update
  • 安装 docker
1
apt-get install docker-ce=5:20.10.7~3-0~ubuntu-xenial docker-ce-cli=5:20.10.7~3-0~ubuntu-xenial containerd.io -y

2.2 安装 ascend-docker-runtime

  • 下载 ascend-docker-runtime

前往 https://gitee.com/ascend/ascend-docker-runtime/releases/tag/v5.0.0-RC3.2 找到对应架构的下载链接。

1
wget https://gitee.com/ascend/ascend-docker-runtime/releases/download/v5.0.0-RC3.2/Ascend-docker-runtime_5.0.RC3.2_linux-x86_64.run
  • 安装 ascend-docker-runtime
1
bash ./Ascend-docker-runtime_5.0.RC3.2_linux-x86_64.run --install
  • 配置 docker
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
cat <<EOF >  /etc/docker/daemon.json
{
  "bip": "x.x.x.x/24",
  "live-restore": true,
  "data-root": "/data/docker",
  "default-runtime": "ascend",
  "insecure-registries": [
  ],
  "registry-mirrors": [
  ],
  "runtimes": {
    "ascend": {
      "path": "/usr/local/Ascend/Ascend-Docker-Runtime/ascend-docker-runtime",
      "runtimeArgs": []
    }
  }
}
EOF
  • 重启 docker
1
2
systemctl daemon-reload
systemctl restart docker

2.3 验证

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
docker run --rm -it --ipc=host \
                    --name all-in-one-test \
                    --device=/dev/davinci0 \
                    --device=/dev/davinci1 \
                    --device=/dev/davinci_manager \
                    --device=/dev/devmm_svm \
                    --device=/dev/hisi_hdc \
                    -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
                    -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
                    -v /usr/local/sbin/:/usr/local/sbin/ \
                    -v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi \
                    -v /var/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf \
                    -v /var/log/npu/slog/:/var/log/npu/slog \
                    -v /var/log/npu/profiling/:/var/log/npu/profiling \
                    -v /var/log/npu/dump/:/var/log/npu/dump \
                    -v /var/log/npu/:/usr/slog \
                    registry.cn-hangzhou.aliyuncs.com/opshub/hccl-test:8.0.RC2-ubuntu22.04 \
                    npu-smi info

4. 安装 cri-docker

  • 安装 CRI-Docker
1
2
3
4
wget https://github.com/Mirantis/cri-dockerd/releases/download/v0.3.10/cri-dockerd-0.3.10.amd64.tgz
tar -zxvf cri-dockerd-0.3.10.amd64.tgz
cp cri-dockerd/cri-dockerd /usr/local/bin/cri-dockerd
chmod +x /usr/local/bin/cri-dockerd
  • 添加用户组
1
# -f: exit successfully if the docker group already exists
# (the docker-ce package normally creates it), so this step is re-runnable.
groupadd -f docker
  • 配置启动文件
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# Write the systemd unit for cri-dockerd.
# The heredoc delimiter is quoted ('EOF') so the shell does NOT expand $MAINPID;
# with an unquoted delimiter the variable expands to an empty string and the
# generated unit would contain a broken "ExecReload=/bin/kill -s HUP " line.
cat <<'EOF' > /etc/systemd/system/cri-docker.service
[Unit]
Description=CRI Interface for Docker Application Container Engine
Documentation=https://docs.mirantis.com
After=network-online.target firewalld.service docker.service
Wants=network-online.target
Requires=cri-docker.socket

[Service]
Type=notify

ExecStart=/usr/local/bin/cri-dockerd --network-plugin=cni --pod-infra-container-image=registry.aliyuncs.com/google_containers/pause:3.9

ExecReload=/bin/kill -s HUP $MAINPID
TimeoutSec=0
RestartSec=2
Restart=always

StartLimitBurst=3

StartLimitInterval=60s

LimitNOFILE=infinity
LimitNPROC=infinity
LimitCORE=infinity

TasksMax=infinity
Delegate=yes
KillMode=process

[Install]
WantedBy=multi-user.target
EOF
  • 生成 socket 文件
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
cat <<EOF > /etc/systemd/system/cri-docker.socket
[Unit]
Description=CRI Docker Socket for the API
PartOf=cri-docker.service

[Socket]
ListenStream=%t/cri-dockerd.sock
SocketMode=0660
SocketUser=root
SocketGroup=docker

[Install]
WantedBy=sockets.target
EOF
  • 启动 CRI-DOCKER
1
2
3
4
systemctl daemon-reload
systemctl start cri-docker
systemctl enable cri-docker
systemctl is-active cri-docker

5. 加入 K8s 集群

5.1 修改 Hostname

1
2
export HOSTNAME=k8s-ascend-910b-27
hostnamectl set-hostname ${HOSTNAME}

创建设备管理插件的工作目录

1
mkdir -p /var/log/mindx-dl/devicePlugin

5.2 初始化内核参数

1
opscli task -f ~/.ops/task/set-host.yaml

5.3 安装 K8s 基础组件

  • 添加 K8s 源

参考 https://developer.aliyun.com/mirror/kubernetes/ ，1.28 以下版本按如下方式添加源：

1
curl https://mirrors.aliyun.com/kubernetes/apt/doc/apt-key.gpg | apt-key add -
1
2
3
cat <<EOF >/etc/apt/sources.list.d/kubernetes.list
deb https://mirrors.aliyun.com/kubernetes/apt/ kubernetes-xenial main
EOF
1
apt-get update
  • 安装 K8s 基础组件
1
export K8S_VERSION=1.25.6
1
apt-get install kubeadm=${K8S_VERSION}-00 kubelet=${K8S_VERSION}-00 kubectl=${K8S_VERSION}-00 -y

5.4 加入集群

  • 生成 Token

在 master 节点生成 token

1
kubeadm token create --print-join-command
  • 加入集群

如果 worker 节点使用 Docker Runtime，在加入集群时需要加上参数 --cri-socket unix:///var/run/cri-dockerd.sock。

1
kubeadm join x.x.x.x:6443 --token x.x --discovery-token-ca-cert-hash sha256:x --cri-socket unix:///var/run/cri-dockerd.sock

5.5 创建测试的 Pod

  • 创建 Pod
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
kubectl apply -f - <<EOF
apiVersion: v1
kind: Pod
metadata:
  name: npu-demo
spec:
  nodeName: ${HOSTNAME}
  containers:
    - name: npu-demo
      image: registry.cn-hangzhou.aliyuncs.com/opshub/hccl-test:8.0.RC2-ubuntu22.04
      command: ["npu-smi", "info"]
      resources:
        requests:
          huawei.com/Ascend910: 4
        limits:
          huawei.com/Ascend910: 4
EOF
  • 查看 Pod 日志
1
kubectl logs npu-demo
  • 删除 Pod
1
kubectl delete pod npu-demo

6. NPU 状态检测

  • TLS 开关状态（确认是否已禁用）
1
for i in {0..15};do hccn_tool -i $i -tls -g| grep 'switch';done
  • IP
1
for i in {0..15};do hccn_tool  -i $i  -ip -g;done
  • 网关
1
for i in {0..15};do hccn_tool  -i $i  -gateway -g;done
  • 健康状态
1
for i in {0..15};do npu-smi info -t health -i $i -c 0| grep 'Health Status';done
  • link 状态
1
for i in {0..15}; do hccn_tool -i $i -link -g;done
  • 网卡状态
1
for i in {0..15};do hccn_tool -i $i -net_health -g;done
  • ECC
1
for i in {0..15}; do npu-smi info -t ecc -i $i;done

微信公众号
作者
微信公众号