From 3f30ec94de721a4ee591f6673f88864c8ff58d27 Mon Sep 17 00:00:00 2001 From: lining Date: Tue, 23 Nov 2021 10:37:06 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E3=80=90=E9=9C=80=E6=B1=82=E6=8F=8F?= =?UTF-8?q?=E8=BF=B0=E3=80=911015=E7=A4=BE=E5=8C=BA=E7=89=88deviceplugin?= =?UTF-8?q?=E5=90=88=E5=85=A5=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=911=E3=80=81=E6=97=A0=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E6=96=B0=E5=A2=9E=E5=8A=9F=E8=83=BD=EF=BC=9B2=E3=80=81?= =?UTF-8?q?=E5=8D=95=E5=85=83=E6=B5=8B=E8=AF=95bugfix=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E4=BA=BA=E3=80=91lining?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.EN.md | 10 +- README.md | 489 ---------------------- README.zh.md | 10 +- build/test.sh | 22 +- ci/dependency.xml | 15 - src/plugin/pkg/npu/huawei/manager_test.go | 4 + src/plugin/pkg/npu/huawei/server_test.go | 5 +- 7 files changed, 38 insertions(+), 517 deletions(-) delete mode 100644 README.md diff --git a/README.EN.md b/README.EN.md index f9a75843..8a54becf 100644 --- a/README.EN.md +++ b/README.EN.md @@ -544,7 +544,15 @@ The device management plugin provides the following functions: -

v2.0.2

+ +

v2.0.3

+ +

2021-10-15

+ + + + +

v2.0.2

2021-07-15

diff --git a/README.md b/README.md deleted file mode 100644 index 239495fb..00000000 --- a/README.md +++ /dev/null @@ -1,489 +0,0 @@ -# Ascend Device Plugin.en -- [Ascend Device Plugin](#ascend-device-plugin.md) - - [Description](#description.md) - - [Compiling the Ascend Device Plugin](#compiling-the-ascend-device-plugin.md) - - [Creating DaemonSet](#creating-daemonset.md) - - [Creating a Service Container](#creating-a-service-container.md) -- [Environment Dependencies](#environment-dependencies.md) -- [Directory Structure](#directory-structure.md) -- [Version Updates](#version-updates.md) -

Ascend Device Plugin

- -- **[Description](#description.md)** - -- **[Compiling the Ascend Device Plugin](#compiling-the-ascend-device-plugin.md)** - -- **[Creating DaemonSet](#creating-daemonset.md)** - -- **[Creating a Service Container](#creating-a-service-container.md)** - - -

Description

- -The device management plug-in provides the following functions: - -- Device discovery: The number of discovered devices can be obtained from the Ascend device driver and reported to the Kubernetes system. -- Health check: The health status of Ascend devices can be detected. When a device is unhealthy, the device is reported to the Kubernetes system and is removed. -- Device allocation: Ascend devices can be allocated in the Kubernetes system. - -

Compiling the Ascend Device Plugin

- -## Procedure - -1. Set environment variables. - - **export GO111MODULE=on** - - **export GOPROXY=**_Proxy address_ - - **export GONOSUMDB=\*** - - >![](figures/icon-note.gif) **NOTE:** - >- Use the actual GOPROXY proxy address. You can run the **go mod download** command in the **ascend-device-plugin** directory to check the address. - >- If no error information is displayed, the proxy is set successfully. - -2. Go to the **ascend-device-plugin** directory and run the following command to modify the YAML file: - - Common YAML file - - **vi ascendplugin.yaml** - - ``` - apiVersion: apps/v1 - kind: DaemonSet - metadata: - name: ascend-device-plugin-daemonset - namespace: kube-system - spec: - selector: - matchLabels: - name: ascend-device-plugin-ds - updateStrategy: - type: RollingUpdate - template: - metadata: - annotations: - scheduler.alpha.kubernetes.io/critical-pod: "" - labels: - name: ascend-device-plugin-ds - spec: - tolerations: - - key: CriticalAddonsOnly - operator: Exists - - key: huawei.com/Ascend910 #Resource name. Set the value based on the processor type. - operator: Exists - effect: NoSchedule - - key: "ascendplugin" - operator: "Equal" - value: "v2" - effect: NoSchedule - priorityClassName: "system-node-critical" - nodeSelector: - accelerator: huawei-Ascend910 #Set the label name based on the processor type. - containers: - - image: ascend-device-plugin:v1.0.1 #Image name and version - name: device-plugin-01 - resources: - requests: - memory: 500Mi - cpu: 500m - limits: - memory: 500Mi - cpu: 500m - command: [ "/bin/bash", "-c", "--"] - args: [ "ascendplugin --useAscendDocker=${USE_ASCEND_DOCKER}" ] - securityContext: - privileged: true - imagePullPolicy: Never - volumeMounts: - - name: device-plugin - mountPath: /var/lib/kubelet/device-plugins - - name: hiai-driver - mountPath: /usr/local/Ascend/driver #Set the value to the actual driver installation directory. 
- - name: log-path - mountPath: /var/log/devicePlugin - volumes: - - name: device-plugin - hostPath: - path: /var/lib/kubelet/device-plugins - - name: hiai-driver - hostPath: - path: /usr/local/Ascend/driver #Set the value to the actual driver installation directory. - - name: log-path - hostPath: - path: /var/log/devicePlugin - - ``` - - - YAML file of MindX DL - - **ascendplugin-volcano.yaml** - - ``` - kind: ClusterRoleBinding - apiVersion: rbac.authorization.k8s.io/v1 - metadata: - name: pods-device-plugin - subjects: - - kind: ServiceAccount - name: default - namespace: kube-system - roleRef: - kind: ClusterRole - name: cluster-admin - apiGroup: rbac.authorization.k8s.io - --- - apiVersion: apps/v1 - kind: DaemonSet - metadata: - name: ascend-device-plugin-daemonset - namespace: kube-system - spec: - selector: - matchLabels: - name: ascend-device-plugin-ds - updateStrategy: - type: RollingUpdate - template: - metadata: - annotations: - scheduler.alpha.kubernetes.io/critical-pod: "" - labels: - name: ascend-device-plugin-ds - spec: - tolerations: - - key: CriticalAddonsOnly - operator: Exists - - key: huawei.com/Ascend910 - operator: Exists - effect: NoSchedule - - key: "ascendplugin" - operator: "Equal" - value: "v2" - effect: NoSchedule - priorityClassName: "system-node-critical" - nodeSelector: - accelerator: huawei-Ascend910 - containers: - - image: ascend-k8sdeviceplugin:v0.0.1 #Image name and version - name: device-plugin-01 - resources: - requests: - memory: 500Mi - cpu: 500m - limits: - memory: 500Mi - cpu: 500m - command: [ "/bin/bash", "-c", "--"] - args: [ "ascendplugin --useAscendDocker=${USE_ASCEND_DOCKER} --volcanoType=true" ] - securityContext: - privileged: true - imagePullPolicy: Never - volumeMounts: - - name: device-plugin - mountPath: /var/lib/kubelet/device-plugins - - name: hiai-driver - mountPath: /usr/local/Ascend/driver #Set the value to the actual driver installation directory. 
- - name: log-path - mountPath: /var/log/devicePlugin - env: - - name: NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - volumes: - - name: device-plugin - hostPath: - path: /var/lib/kubelet/device-plugins - - name: hiai-driver - hostPath: - path: /usr/local/Ascend/driver #Set the value to the actual driver installation directory. - - name: log-path - hostPath: - path: /var/log/devicePlugin - - ``` - - -3. Run the following command to edit the **Dockerfile** file and change the image name and version to the obtained values: - - **vi** _/home/test/_ascend-device-plugin**/Dockerfile** - - ``` - #Select the basic image as required. You can run the docker images command to query the basic image. - FROM ubuntu:18.04 as build - #Specify whether to use Ascend Docker. The default value is true. Change it to false. - ENV USE_ASCEND_DOCKER true - - ENV LD_LIBRARY_PATH /usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common - - ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:/usr/local/Ascend/driver/lib64/ - - COPY ./output/ascendplugin /usr/local/bin/ - - ``` - -4. Run the following commands to generate a binary file and image file \(use the actual script name\): - - **cd** _/home/test/_ascend-device-plugin**/build**/ - - **chmod +x build.sh** - - **dos2unix build.sh** - - **./build.sh dockerimages** - -5. Run the following command to view the generated software package: - - **ll** _/home/test/_ascend-device-plugin**/output** - - The software package name for the x86 environment and that for the ARM environment are different. The following uses the ARM environment as an example. - - >![](figures/icon-note.gif) **NOTE:** - >- **Ascend-K8sDevicePlugin-**_xxx_**-arm64-Docker.tar.gz**: K8s device plugin image. - >- **Ascend-K8sDevicePlugin-**_xxx_**-arm64-Linux.tar.gz**: binary installation package of the K8s device plugin. 
- - ``` - drwxr-xr-x 2 root root 4096 Jun 8 18:42 ./ - drwxr-xr-x 9 root root 4096 Jun 8 17:12 ../ - -rw-r--r-- 1 root root 29584705 Jun 9 10:37 Ascend-K8sDevicePlugin-xxx-arm64-Docker.tar.gz - -rw-r--r-- 1 root root 6721073 Jun 9 16:20 Ascend-K8sDevicePlugin-xxx-arm64-Linux.tar.gz - ``` - - -

Creating DaemonSet

- -## Procedure - ->![](figures/icon-note.gif) **NOTE:** ->The following uses the tar.gz file generated on the ARM platform as an example. - -1. Run the following command to check whether the Docker software package is successfully imported: - - **docker images** - - - If yes, go to [3](#en-us_topic_0269670254_li26268471380). - - If no, perform [2](#en-us_topic_0269670254_li1372334715567) to import the file again. - -2. Go to the directory where the Docker software package is stored and run the following command to import the Docker image. - - **cd** _/home/test/_**ascend-device-plugin/output** - - **docker load** **-i** _Ascend-K8sDevicePlugin-xxx-arm64-Docker.tar.gz_ - -3. Run the following command to label the node with Ascend 910 or Ascend 310: - - **kubectl label nodes** _localhost.localdomain_ **accelerator=**_huawei-Ascend910_ - - **localhost.localdomain** is the name of the node with Ascend 910 or Ascend 310. You can run the **kubectl get node** command to view the node name. - - The label name must be the same as the **nodeSelector** label name in the YAML file in "Compiling the Ascend Device Plugin." - - >![](figures/icon-note.gif) **NOTE:** - >If the K8s plugin needs to be deployed on a new node, perform [2](#en-us_topic_0269670254_li1372334715567) to [3](#en-us_topic_0269670254_li26268471380). - -4. Run the following commands to deploy DaemonSet: - - **cd** _/home/test/_**ascend-device-plugin** - - **kubectl apply -f ascendplugin.yaml** - - >![](figures/icon-note.gif) **NOTE:** - >To view the node deployment information, you need to wait for several minutes after the deployment is complete. - -5. Run the following command to view the node device deployment information: - - **kubectl describe node** - - If the label and number of nodes are correct, the deployment is successful, as shown in the following figure. 
- - ``` - Capacity: - cpu: 128 - ephemeral-storage: 3842380928Ki - huawei.com/Ascend910: 8 - hugepages-2Mi: 0 - memory: 263865068Ki - pods: 110 - Allocatable: - cpu: 128 - ephemeral-storage: 3541138257382 - huawei.com/Ascend910: 8 - hugepages-2Mi: 0 - memory: 263762668Ki - pods: 110 - ``` - - -

Creating a Service Container

- -## Procedure - -1. Go to the **ascend-device-plugin** directory and run the following command to edit the pod configuration file: - - **cd** _/home/test/_**ascend-device-plugin** - - **vi ascend.yaml** - - ``` - apiVersion: v1 #Specifies the API version. This value must be included in kubectl apiversion. - kind: Pod #Role or type of the resource to be created - metadata: - name: rest502 #Pod name, which must be unique in the same namespace. - spec: #Detailed definition of a container in a pod. - containers: #Containers in the pod. - - name: rest502 #Container name in the pod. - image: centos_arm64_resnet50:7.8 #Address of the inference or training service image used by the container in the pod. - imagePullPolicy: Never - resources: - limits: #Resource limits - huawei.com/Ascend310: 2 #Change the value based on the actual resource type. - volumeMounts: - - name: joblog - mountPath: /home/log/ #Path of the container internal log. Change the value based on the task requirements. - - name: model - mountPath: /home/app/model #Container internal model path. Change the value based on the task requirements. - - name: slog-path - mountPath: /var/log/npu/conf/slog/slog.conf - - name: ascend-driver-path - mountPath: /usr/local/Ascend/driver #Change the value based on the actual driver path. - volumes: - - name: joblog - hostPath: - path: /home/test/docker_log #Log path mounted to the host. Change the value based on the task requirements. - - name: model - hostPath: - path: /home/test/docker_model/ #Model path mounted to the host. Change the value based on the task requirements. - - name: slog-path - hostPath: - path: /var/log/npu/conf/slog/slog.conf - - name: ascend-driver-path - hostPath: - path: /usr/local/Ascend/driver #Change the value based on the actual driver path. - ``` - -2. 
Run the following command to create a pod: - - **kubectl apply -f ascend.yaml** - - >![](figures/icon-note.gif) **NOTE:** - >To delete the pod, run the following command: - >**kubectl delete -f** **ascend.yaml** - -3. Run the following commands to access the pod and view the allocation information: - - **kubectl exec -it** _Pod name_ **bash** - - The pod name is the one configured in [1](#en-us_topic_0269670251_en-us_topic_0249483204_li104071617503). - - **ls /dev/** - - In the command output similar to the following, **davinci3** and **davinci4** are the allocated devices. - - ``` - core davinci3 davinci4 davinci_manager devmm_svm fd full hisi_hdc mqueue null ptmx - ``` - - -

Environment Dependencies

- -**Table 1** Environment Dependencies - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Check Item

-

Requirement

-

dos2unix

-

Run the dos2unix --version command to check that the software has been installed. There is no requirement on the version.

-

Driver version of the RUN package

-

Go to the directory of the driver (for example, /usr/local/Ascend/driver) and run the cat version.info command to confirm that the driver version is 1.73 or later.

-

Go language environment

-

Run the go version command to confirm that the version is 1.14.3 or later.

-

gcc version

-

Run the gcc --version command to confirm that the version is 7.3.0 or later.

-

Kubernetes version

-

1.17.x. Select the latest bugfix version.

-

You can run the kubectl version command to view the version.

-

Docker environment

-

Run the docker info command to confirm that Docker has been installed.

-

root user permission

-

Check that the root user permission of the BMS is available.

-
- -

Directory Structure

- -``` -├── build # Compilation scripts -│ └── build.sh -├── output # Compilation result directory. -├── src # Source code directory. -│ └── plugin -│ │ ├── cmd/ascendplugin -│ │ │ └── ascend_plugin.go -│ │ └── pkg/npu/huawei -├── test # Test directory. -├── Dockerfile # Image file. -├── LICENSE -├── Open Source Software Notice.md -├── README.zh.md -├── ascend.yaml # YAML file of the sample running task -├── ascendplugin-310.yaml # YAML file for deploying the inference card -├── ascendplugin-volcano.yaml # YAML file for implementing affinity scheduling and deployment with Volcano. -├──ascendplugin.yaml # YAML file for deploying the inference card -├── docker_run.sh # Docker running command -├── go.mod -└── go.sum -``` - -

Version Updates

- - - - - - - - - - - - - - - - -

Version

-

Date

-

Description

-

v2.0.1

-

2021-01-08

-

Optimized the description in "Creating DaemonSet."

-

v2.0.1

-

2020-11-18

-

This is the first official release.

-
- diff --git a/README.zh.md b/README.zh.md index e08ba4be..994130dd 100644 --- a/README.zh.md +++ b/README.zh.md @@ -543,7 +543,15 @@ -

v2.0.2

+ +

v2.0.3

+ +

2021-10-15

+ + + + +

v2.0.2

2021-07-15

diff --git a/build/test.sh b/build/test.sh index b537164c..538881fb 100644 --- a/build/test.sh +++ b/build/test.sh @@ -25,18 +25,18 @@ mockgen k8s.io/client-go/kubernetes/typed/core/v1 PodInterface >${MOCK_TOP}/mock export PKG_CONFIG_PATH=${TOP_DIR}/src/plugin/config/config_310/:$PKG_CONFIG_PATH function execute_test() { - if ! (go test -v -race -coverprofile cov.out ${TOP_DIR}/src/plugin/pkg/npu/huawei/ >./$file_input); then - echo '****** go test cases error! ******' - echo 'Failed' >$file_input - exit 1 - else - echo ${file_detail_output} - gocov convert cov.out | gocov-html >${file_detail_output} - gotestsum --junitfile unit-gotestsum.xml "${TOP_DIR}"/src/plugin/pkg/npu/huawei/... - fi + if ! (go test -v -race -coverprofile cov.out ${TOP_DIR}/src/plugin/pkg/npu/huawei/ >./$file_input); then + echo '****** go test cases error! ******' + echo 'Failed' >$file_input + exit 1 + else + echo ${file_detail_output} + gocov convert cov.out | gocov-html >${file_detail_output} + gotestsum --junitfile unit-tests.xml "${TOP_DIR}"/src/plugin/pkg/npu/huawei/... 
+ fi } -file_input='testDeviceplugin.txt' +file_input='testDevicePlugin.txt' file_detail_output='api.html' echo "************************************* Start LLT Test *************************************" @@ -65,3 +65,5 @@ rm -rf ${MOCK_TOP}/mock_v1 rm -rf ${MOCK_TOP}/mock_kubernetes rm -rf ${MOCK_TOP}/mock_kubelet_v1beta1 exit 0 + +} \ No newline at end of file diff --git a/ci/dependency.xml b/ci/dependency.xml index dc3234ed..cbf11595 100644 --- a/ci/dependency.xml +++ b/ci/dependency.xml @@ -17,20 +17,5 @@ - - - BVersion - Generic - - Tuscany - Tuscany V100R001C75B020 - - - - Alpha/Inference/InstallPackages/Ascend310-driver-*.run - platform/Tuscany - - - \ No newline at end of file diff --git a/src/plugin/pkg/npu/huawei/manager_test.go b/src/plugin/pkg/npu/huawei/manager_test.go index 04b698c1..7ae9a703 100644 --- a/src/plugin/pkg/npu/huawei/manager_test.go +++ b/src/plugin/pkg/npu/huawei/manager_test.go @@ -60,6 +60,10 @@ func createFakeDevManager(runMode string) *HwDevManager { // TestHwDevManager_Serve for serve func TestHwDevManager_Serve(t *testing.T) { fakeHwDevManager := createFakeDevManager("") + errDir := os.MkdirAll("/var/lib/kubelet/device-plugins/",os.ModePerm) + if errDir != nil { + t.Fatal("TestHwDevManager_Serve Run FAiled, reason is failed to create folder file") + } f, err := os.Create(serverSock310) if err != nil { t.Fatal("TestHwDevManager_Serve Run FAiled, reason is failed to create sock file") diff --git a/src/plugin/pkg/npu/huawei/server_test.go b/src/plugin/pkg/npu/huawei/server_test.go index 22d07182..3a08879e 100644 --- a/src/plugin/pkg/npu/huawei/server_test.go +++ b/src/plugin/pkg/npu/huawei/server_test.go @@ -23,6 +23,7 @@ import ( "google.golang.org/grpc" "k8s.io/apimachinery/pkg/util/sets" pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" + "os" "testing" ) @@ -61,7 +62,9 @@ func TestStart(t *testing.T) { pluginSocketPath := "/var/lib/kubelet/device-plugins/" + pluginSocket hps := NewHwPluginServe(fakeHwDevManager, 
"Ascend910", pluginSocketPath) err := hps.Start(pluginSocket, pluginSocketPath) - if err != nil { + kubeSocketPath := "/var/lib/kubelet/device-plugins/kubelet.sock" + _, kubeErr := os.Stat(kubeSocketPath) + if err != nil && kubeErr != nil && os.IsExist(kubeErr) { t.Fatal("TestStart Run Failed") } t.Logf("TestStart Run Pass") -- Gitee From b20741112b323af3071e8473ebb28965473fea85 Mon Sep 17 00:00:00 2001 From: lining Date: Tue, 23 Nov 2021 11:19:16 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E3=80=90=E9=9C=80=E6=B1=82=E6=8F=8F?= =?UTF-8?q?=E8=BF=B0=E3=80=911015=E7=A4=BE=E5=8C=BA=E7=89=88deviceplugin?= =?UTF-8?q?=E5=90=88=E5=85=A5=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=E3=80=911=E3=80=81=E6=97=A0=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E6=96=B0=E5=A2=9E=E5=8A=9F=E8=83=BD=EF=BC=9B2=E3=80=81?= =?UTF-8?q?=E5=8D=95=E5=85=83=E6=B5=8B=E8=AF=95bugfix=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E4=BA=BA=E3=80=91lining?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ci/dependency.xml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/ci/dependency.xml b/ci/dependency.xml index cbf11595..e9d6083a 100644 --- a/ci/dependency.xml +++ b/ci/dependency.xml @@ -17,5 +17,20 @@ + + + BVersion + Generic + + Tuscany + Tuscany V100R001C75B020 + + + + Alpha/Inference/InstallPackages/Ascend310-driver-*.run + platform/Tuscany + + + \ No newline at end of file -- Gitee