1. 安装驱动
1
2
| groupadd -g 1000 HwHiAiUser
useradd -g HwHiAiUser -u 1000 -d /home/HwHiAiUser -m HwHiAiUser -s /bin/bash
|
1
2
| chown -R HwHiAiUser /usr/local/Ascend
chmod -R 755 /usr/local/Ascend
|
前往 https://www.hiascend.com/hardware/firmware-drivers/community?product=1&model=30&cann=All&driver=1.0.26.alpha 找到对应的驱动和固件。
1
| wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Ascend%20HDK/Ascend%20HDK%2024.1.RC2.2/Ascend-hdk-910b-npu-driver_24.1.rc2.2_linux-x86-64.run
|
1
| wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Ascend%20HDK/Ascend%20HDK%2024.1.RC2.2/Ascend-hdk-910b-npu-firmware_7.3.0.2.220.run
|
1
| bash ./Ascend-hdk-910b-npu-driver_24.1.rc2.2_linux-x86-64.run --full --install-for-all
|
1
| bash ./Ascend-hdk-910b-npu-firmware_7.3.0.2.220.run --full
|
2. 安装 ascend-docker-runtime
前往 https://gitee.com/ascend/ascend-docker-runtime/releases/tag/v5.0.0-RC3.2 找到对应架构的下载链接。
1
| wget https://gitee.com/ascend/ascend-docker-runtime/releases/download/v5.0.0-RC3.2/Ascend-docker-runtime_5.0.RC3.2_linux-x86_64.run
|
1
| bash ./Ascend-docker-runtime_5.0.RC3.2_linux-x86_64.run --install
|
3. 安装 Docker [可选]
Docker 和 Containerd 二选一。
3.1 安装 docker
1
2
| curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - &&
curl -fsSL https://mirrors.aliyun.com/docker-ce/linux/ubuntu/gpg | sudo apt-key add -
|
1
2
3
4
| cat >> /etc/apt/sources.list.d/docker.list << EOF
deb [arch=amd64] https://mirrors.aliyun.com/docker-ce/linux/ubuntu xenial stable
deb [arch=amd64] https://download.docker.com/linux/ubuntu xenial stable
EOF
|
1
| apt-get install docker-ce=5:20.10.7~3-0~ubuntu-xenial docker-ce-cli=5:20.10.7~3-0~ubuntu-xenial containerd.io -y
|
3.2 配置 docker
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
| cat <<EOF > /etc/docker/daemon.json
{
"bip": "x.x.x.x/24",
"live-restore": true,
"data-root": "/data/docker",
"default-runtime": "ascend",
"insecure-registries": [
],
"registry-mirrors": [
],
"runtimes": {
"ascend": {
"path": "/usr/local/Ascend/Ascend-Docker-Runtime/ascend-docker-runtime",
"runtimeArgs": []
}
}
}
EOF
|
1
2
| systemctl daemon-reload
systemctl restart docker
|
3.3 安装 cri-docker
用于 Kubelet 调用，如果使用的是 containerd，可以跳过这一步。
1
2
3
4
| wget https://github.com/Mirantis/cri-dockerd/releases/download/v0.3.10/cri-dockerd-0.3.10.amd64.tgz
tar -zxvf cri-dockerd-0.3.10.amd64.tgz
cp cri-dockerd/cri-dockerd /usr/local/bin/cri-dockerd
chmod +x /usr/local/bin/cri-dockerd
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
| cat <<'EOF' > /etc/systemd/system/cri-docker.service
[Unit]
Description=CRI Interface for Docker Application Container Engine
Documentation=https://docs.mirantis.com
After=network-online.target firewalld.service docker.service
Wants=network-online.target
Requires=cri-docker.socket
[Service]
Type=notify
ExecStart=/usr/local/bin/cri-dockerd --network-plugin=cni --pod-infra-container-image=registry.aliyuncs.com/google_containers/pause:3.9
ExecReload=/bin/kill -s HUP $MAINPID
TimeoutSec=0
RestartSec=2
Restart=always
StartLimitBurst=3
StartLimitInterval=60s
LimitNOFILE=infinity
LimitNPROC=infinity
LimitCORE=infinity
TasksMax=infinity
Delegate=yes
KillMode=process
[Install]
WantedBy=multi-user.target
EOF
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
| cat <<EOF > /etc/systemd/system/cri-docker.socket
[Unit]
Description=CRI Docker Socket for the API
PartOf=cri-docker.service
[Socket]
ListenStream=%t/cri-dockerd.sock
SocketMode=0660
SocketUser=root
SocketGroup=docker
[Install]
WantedBy=sockets.target
EOF
|
1
2
3
4
| systemctl daemon-reload
systemctl start cri-docker
systemctl enable cri-docker
systemctl is-active cri-docker
|
3.4 验证
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
| docker run --rm -it --ipc=host \
--device=/dev/davinci0 \
--device=/dev/davinci1 \
--device=/dev/davinci2 \
--device=/dev/davinci3 \
--device=/dev/davinci_manager \
--device=/dev/devmm_svm \
--device=/dev/hisi_hdc \
-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
-v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
-v /usr/local/sbin/:/usr/local/sbin/ \
-v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi \
-v /var/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf \
-v /var/log/npu/slog/:/var/log/npu/slog \
-v /var/log/npu/profiling/:/var/log/npu/profiling \
-v /var/log/npu/dump/:/var/log/npu/dump \
-v /var/log/npu/:/usr/slog \
registry.cn-hangzhou.aliyuncs.com/opshub/hccl-test:8.0.RC2-ubuntu22.04 \
npu-smi info
|
4. 安装 Containerd [可选]
Docker 和 Containerd 二选一。
4.1 安装 containerd
1
2
| apt-get update
apt-get install -y ca-certificates curl gnupg lsb-release
|
1
2
| mkdir -p /etc/apt/keyrings
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg
|
1
2
3
| echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \
$(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
|
1
2
| apt update
apt install containerd.io=1.6.31-1
|
4.2 配置 containerd
1
2
| mkdir -p /etc/containerd
containerd config default > /etc/containerd/config.toml
|
1
2
3
4
| sed -i 's#root = "/var/lib/containerd"#root = "/data/containerd"#g' /etc/containerd/config.toml
sed -i 's#state = "/run/containerd"#state = "/data/run/containerd"#g' /etc/containerd/config.toml
sed -i 's#sandbox_image = "registry.k8s.io/pause:3.6"#sandbox_image = "registry.aliyuncs.com/google_containers/pause:3.9"#g' /etc/containerd/config.toml
sed -i 's#SystemdCgroup = false#SystemdCgroup = true#g' /etc/containerd/config.toml
|
1
| sed -i 's#runtime = "runc"#runtime = "/usr/local/Ascend/Ascend-Docker-Runtime/ascend-docker-runtime"#g' /etc/containerd/config.toml
|
1
| systemctl restart containerd
|
4.3 验证
1
| opscli task -f ~/.ops/tasks/install-nerdctl.yaml
|
ARM 镜像
1
| export TEST_IMAGE=registry.cn-hangzhou.aliyuncs.com/opshub/python38-cann:8.0rc3-arm64
|
AMD64 镜像
1
| export TEST_IMAGE=registry.cn-hangzhou.aliyuncs.com/opshub/python38-cann:8.0rc2-amd64
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
| nerdctl -n k8s.io run --rm -it --ipc=host \
--device=/dev/davinci0 \
--device=/dev/davinci1 \
--device=/dev/davinci2 \
--device=/dev/davinci3 \
--device=/dev/davinci_manager \
--device=/dev/devmm_svm \
--device=/dev/hisi_hdc \
-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
-v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
-v /usr/local/sbin/:/usr/local/sbin/ \
-v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi \
-v /var/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf \
-v /var/log/npu/slog/:/var/log/npu/slog \
-v /var/log/npu/profiling/:/var/log/npu/profiling \
-v /var/log/npu/dump/:/var/log/npu/dump \
-v /var/log/npu/:/usr/slog \
${TEST_IMAGE} \
npu-smi info
|
4. 加入 K8s 集群
4.1 修改 Hostname
1
2
| export HOSTNAME=k8s-ascend-910b-27
hostnamectl set-hostname ${HOSTNAME}
|
创建设备管理插件的工作目录
1
| mkdir -p /var/log/mindx-dl/devicePlugin
|
4.2 初始化内核参数
1
| opscli task -f ~/.ops/tasks/set-host.yaml
|
4.3 安装 K8s 基础组件
参考 https://developer.aliyun.com/mirror/kubernetes/ ，1.28 以下版本按如下方式添加 apt 源：
1
| curl https://mirrors.aliyun.com/kubernetes/apt/doc/apt-key.gpg | apt-key add -
|
1
2
3
| cat <<EOF >/etc/apt/sources.list.d/kubernetes.list
deb https://mirrors.aliyun.com/kubernetes/apt/ kubernetes-xenial main
EOF
|
1
| export K8S_VERSION=1.25.6
|
1
| apt-get install kubeadm=${K8S_VERSION}-00 kubelet=${K8S_VERSION}-00 kubectl=${K8S_VERSION}-00 -y
|
4.4 加入集群
在 master 节点生成 token
1
| kubeadm token create --print-join-command
|
如果 worker 节点使用的是 Docker Runtime，在加入集群时需要加上参数 --cri-socket unix:///var/run/cri-dockerd.sock。
1
| kubeadm join x.x.x.x:6443 --token x.x --discovery-token-ca-cert-hash sha256:x --cri-socket unix:///var/run/cri-dockerd.sock
|
4.5 创建测试的 Pod
ARM 镜像
1
| export TEST_IMAGE=registry.cn-hangzhou.aliyuncs.com/opshub/python38-cann:8.0rc3-arm64
|
AMD64 镜像
1
| export TEST_IMAGE=registry.cn-hangzhou.aliyuncs.com/opshub/python38-cann:8.0rc2-amd64
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
| kubectl apply -f - <<EOF
apiVersion: v1
kind: Pod
metadata:
name: npu-demo
spec:
nodeName: ${HOSTNAME}
containers:
- name: npu-demo
image: ${TEST_IMAGE}
command: ["npu-smi", "info"]
resources:
requests:
huawei.com/Ascend910: 4
limits:
huawei.com/Ascend910: 4
EOF
|
1
| kubectl delete pod npu-demo
|
5. NPU 状态检测
1
| for i in {0..15};do hccn_tool -i $i -tls -g| grep 'switch';done
|
1
| for i in {0..15};do hccn_tool -i $i -ip -g;done
|
1
| for i in {0..15};do hccn_tool -i $i -gateway -g;done
|
1
| for i in {0..15};do npu-smi info -t health -i $i -c 0| grep 'Health Status';done
|
1
| for i in {0..15}; do hccn_tool -i $i -link -g;done
|
1
| for i in {0..15};do hccn_tool -i $i -net_health -g;done
|
1
| for i in {0..15}; do npu-smi info -t ecc -i $i;done
|
6. 参考