1. 安装驱动
1
2
| groupadd -g 1000 HwHiAiUser
useradd -g HwHiAiUser -u 1000 -d /home/HwHiAiUser -m HwHiAiUser -s /bin/bash
|
1
2
| chown -R HwHiAiUser /usr/local/Ascend
chmod -R 755 /usr/local/Ascend
|
前往 https://www.hiascend.com/hardware/firmware-drivers/community?product=1&model=30&cann=All&driver=1.0.26.alpha 找到对应的驱动和固件。
1
| wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Ascend%20HDK/Ascend%20HDK%2024.1.RC2.2/Ascend-hdk-910b-npu-driver_24.1.rc2.2_linux-x86-64.run
|
1
| wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Ascend%20HDK/Ascend%20HDK%2024.1.RC2.2/Ascend-hdk-910b-npu-firmware_7.3.0.2.220.run
|
1
| bash ./Ascend-hdk-910b-npu-driver_24.1.rc2.2_linux-x86-64.run --full --install-for-all
|
1
| bash ./Ascend-hdk-910b-npu-firmware_7.3.0.2.220.run --full
|
2. 安装 docker runtime
2.1 安装 docker
1
2
| curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - &&
curl -fsSL https://mirrors.aliyun.com/docker-ce/linux/ubuntu/gpg | sudo apt-key add -
|
1
2
3
4
| cat >> /etc/apt/sources.list.d/docker.list << EOF
deb [arch=amd64] https://mirrors.aliyun.com/docker-ce/linux/ubuntu xenial stable
deb [arch=amd64] https://download.docker.com/linux/ubuntu xenial stable
EOF
|
1
| apt-get install docker-ce=5:20.10.7~3-0~ubuntu-xenial docker-ce-cli=5:20.10.7~3-0~ubuntu-xenial containerd.io -y
|
2.2 安装 ascend-docker-runtime
前往 https://gitee.com/ascend/ascend-docker-runtime/releases/tag/v5.0.0-RC3.2 找到对应架构的下载链接。
1
| wget https://gitee.com/ascend/ascend-docker-runtime/releases/download/v5.0.0-RC3.2/Ascend-docker-runtime_5.0.RC3.2_linux-x86_64.run
|
1
| bash ./Ascend-docker-runtime_5.0.RC3.2_linux-x86_64.run --install
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
| cat <<EOF > /etc/docker/daemon.json
{
"bip": "x.x.x.x/24",
"live-restore": true,
"data-root": "/data/docker",
"default-runtime": "ascend",
"insecure-registries": [
],
"registry-mirrors": [
],
"runtimes": {
"ascend": {
"path": "/usr/local/Ascend/Ascend-Docker-Runtime/ascend-docker-runtime",
"runtimeArgs": []
}
}
}
EOF
|
1
2
| systemctl daemon-reload
systemctl restart docker
|
2.3 验证
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
| docker run --rm -it --ipc=host \
--name all-in-one-test \
--device=/dev/davinci0 \
--device=/dev/davinci1 \
--device=/dev/davinci_manager \
--device=/dev/devmm_svm \
--device=/dev/hisi_hdc \
-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
-v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
-v /usr/local/sbin/:/usr/local/sbin/ \
-v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi \
-v /var/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf \
-v /var/log/npu/slog/:/var/log/npu/slog \
-v /var/log/npu/profiling/:/var/log/npu/profiling \
-v /var/log/npu/dump/:/var/log/npu/dump \
-v /var/log/npu/:/usr/slog \
registry.cn-hangzhou.aliyuncs.com/opshub/hccl-test:8.0.RC2-ubuntu22.04 \
npu-smi info
|
4. 安装 cri-docker
1
2
3
4
| wget https://github.com/Mirantis/cri-dockerd/releases/download/v0.3.10/cri-dockerd-0.3.10.amd64.tgz
tar -zxvf cri-dockerd-0.3.10.amd64.tgz
cp cri-dockerd/cri-dockerd /usr/local/bin/cri-dockerd
chmod +x /usr/local/bin/cri-dockerd
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
| cat <<'EOF' > /etc/systemd/system/cri-docker.service
[Unit]
Description=CRI Interface for Docker Application Container Engine
Documentation=https://docs.mirantis.com
After=network-online.target firewalld.service docker.service
Wants=network-online.target
Requires=cri-docker.socket
[Service]
Type=notify
ExecStart=/usr/local/bin/cri-dockerd --network-plugin=cni --pod-infra-container-image=registry.aliyuncs.com/google_containers/pause:3.9
ExecReload=/bin/kill -s HUP $MAINPID
TimeoutSec=0
RestartSec=2
Restart=always
StartLimitBurst=3
StartLimitInterval=60s
LimitNOFILE=infinity
LimitNPROC=infinity
LimitCORE=infinity
TasksMax=infinity
Delegate=yes
KillMode=process
[Install]
WantedBy=multi-user.target
EOF
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
| cat <<EOF > /etc/systemd/system/cri-docker.socket
[Unit]
Description=CRI Docker Socket for the API
PartOf=cri-docker.service
[Socket]
ListenStream=%t/cri-dockerd.sock
SocketMode=0660
SocketUser=root
SocketGroup=docker
[Install]
WantedBy=sockets.target
EOF
|
1
2
3
4
| systemctl daemon-reload
systemctl start cri-docker
systemctl enable cri-docker
systemctl is-active cri-docker
|
5. 加入 K8s 集群
5.1 修改 Hostname
1
2
| export HOSTNAME=k8s-ascend-910b-27
hostnamectl set-hostname ${HOSTNAME}
|
创建设备管理插件的工作目录
1
| mkdir -p /var/log/mindx-dl/devicePlugin
|
5.2 初始化内核参数
1
| opscli task -f ~/.ops/task/set-host.yaml
|
5.3 安装 K8s 基础组件
对于 1.28 以下版本,参考 https://developer.aliyun.com/mirror/kubernetes/ 添加阿里云的 Kubernetes apt 源:
1
| curl https://mirrors.aliyun.com/kubernetes/apt/doc/apt-key.gpg | apt-key add -
|
1
2
3
| cat <<EOF >/etc/apt/sources.list.d/kubernetes.list
deb https://mirrors.aliyun.com/kubernetes/apt/ kubernetes-xenial main
EOF
|
1
| export K8S_VERSION=1.25.6
|
1
| apt-get install kubeadm=${K8S_VERSION}-00 kubelet=${K8S_VERSION}-00 kubectl=${K8S_VERSION}-00 -y
|
5.4 加入集群
在 master 节点生成 token
1
| kubeadm token create --print-join-command
|
如果 worker 节点使用 Docker 作为容器运行时,加入集群时需要加上参数 --cri-socket unix:///var/run/cri-dockerd.sock。
1
| kubeadm join x.x.x.x:6443 --token x.x --discovery-token-ca-cert-hash sha256:x --cri-socket unix:///var/run/cri-dockerd.sock
|
5.5 创建测试的 Pod
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
| kubectl apply -f - <<EOF
apiVersion: v1
kind: Pod
metadata:
name: npu-demo
spec:
nodeName: ${HOSTNAME}
containers:
- name: npu-demo
image: registry.cn-hangzhou.aliyuncs.com/opshub/hccl-test:8.0.RC2-ubuntu22.04
command: ["npu-smi", "info"]
resources:
requests:
huawei.com/Ascend910: 4
limits:
huawei.com/Ascend910: 4
EOF
|
1
| kubectl delete pod npu-demo
|
6. NPU 状态检测
1
| for i in {0..15};do hccn_tool -i $i -tls -g| grep 'switch';done
|
1
| for i in {0..15};do hccn_tool -i $i -ip -g;done
|
1
| for i in {0..15};do hccn_tool -i $i -gateway -g;done
|
1
| for i in {0..15};do npu-smi info -t health -i $i -c 0| grep 'Health Status';done
|
1
| for i in {0..15}; do hccn_tool -i $i -link -g;done
|
1
| for i in {0..15};do hccn_tool -i $i -net_health -g;done
|
1
| for i in {0..15}; do npu-smi info -t ecc -i $i;done
|