From c9a11e102418766b0cff4f2a09017f56c95e0e5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=AE=81?= Date: Fri, 29 Dec 2023 09:38:14 +0000 Subject: [PATCH 01/11] =?UTF-8?q?!226=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E8=AF=B4=E6=98=8E=20Modification=E3=80=91=20=E5=8D=87=E7=BA=A7?= =?UTF-8?q?npu-exporter=20Merge=20pull=20request=20!226=20from=20=E6=9D=8E?= =?UTF-8?q?=E5=AE=81/branch=5Fv5.0.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- go.mod | 4 ++-- go.sum | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/go.mod b/go.mod index 64b8d633..f8717d7f 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ require ( github.com/fsnotify/fsnotify v1.6.0 github.com/smartystreets/goconvey v1.7.2 google.golang.org/grpc v1.57.2 - huawei.com/npu-exporter/v5 v5.0.0-RC4.b002 + huawei.com/npu-exporter/v5 v5.0.0 k8s.io/api v0.25.13 k8s.io/apimachinery v0.25.13 k8s.io/client-go v0.25.13 @@ -75,7 +75,7 @@ require ( replace ( github.com/containernetworking/cni => github.com/containernetworking/cni v0.8.1 github.com/gorilla/websocket => github.com/gorilla/websocket v1.4.2 - huawei.com/npu-exporter/v5 => gitee.com/ascend/ascend-npu-exporter/v5 v5.0.0-RC4.b002 + huawei.com/npu-exporter/v5 => gitee.com/ascend/ascend-npu-exporter/v5 v5.0.0 k8s.io/api => k8s.io/api v0.25.13 k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.25.13 k8s.io/apimachinery => k8s.io/apimachinery v0.25.13 diff --git a/go.sum b/go.sum index 62f13e67..5e8489e8 100644 --- a/go.sum +++ b/go.sum @@ -1,6 +1,6 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= -gitee.com/ascend/ascend-npu-exporter/v5 v5.0.0-RC4.b002 h1:t593tTlEbnhzOYlwsA9NtbqvoB4E/NA7lkbcPJsbvmw= -gitee.com/ascend/ascend-npu-exporter/v5 v5.0.0-RC4.b002/go.mod h1:uAjPsLJAJ89j/+yMXwoiEhQ037443MoTfG1afXXXdHc= +gitee.com/ascend/ascend-npu-exporter/v5 v5.0.0 h1:hA93VQJXkYbkLjwNlIwL0C6krpIhsQSqIJ3+l+fiD9Q= +gitee.com/ascend/ascend-npu-exporter/v5 v5.0.0/go.mod h1:uAjPsLJAJ89j/+yMXwoiEhQ037443MoTfG1afXXXdHc= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/Microsoft/go-winio v0.4.17 h1:iT12IBVClFevaf8PuVyi3UmZOVh4OqnaLxDTW2O6j3w= github.com/Microsoft/go-winio v0.4.17/go.mod h1:JPGBdM1cNvN/6ISo+n8V5iA4v8pBzdOpzfwIujj1a84= -- Gitee From 1af2cc62f14109dc75c7f7c07ec4a38fa332ca3d Mon Sep 17 00:00:00 2001 From: piyuwei Date: Tue, 30 Jan 2024 19:19:15 +0800 Subject: [PATCH 02/11] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E8=AF=BB=E5=8F=96faultcode?= =?UTF-8?q?=E6=97=B6=E5=A6=82=E6=9E=9C=E9=81=87=E5=88=B0=E6=97=A0=E6=B3=95?= =?UTF-8?q?=E8=A7=A3=E6=9E=90=E7=9A=84=E5=AD=97=E6=AE=B5=EF=BC=8C=E8=B7=B3?= =?UTF-8?q?=E8=BF=87=E5=B9=B6=E7=BB=A7=E7=BB=AD=E5=A4=84=E7=90=86=E5=90=8E?= =?UTF-8?q?=E9=9D=A2=E7=9A=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/slice_common.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/common/slice_common.go b/pkg/common/slice_common.go index 79cb317d..90535888 100644 --- a/pkg/common/slice_common.go +++ b/pkg/common/slice_common.go @@ -124,8 +124,8 @@ func (s stringTool) HexStringToInt(sources []string) []int64 { for _, source := range sources { num, err := strconv.ParseInt(source, Hex, 0) if err != nil { - hwlog.RunLog.Errorf("parse hex int failed , string: %s", source) - return nil + hwlog.RunLog.Errorf("parse hex int failed and skip it, string: %s", source) + continue } intSlice = append(intSlice, num) } -- Gitee From 64f38e4e9b6b0dcad57deaa2a98c1c70e0efe1e0 Mon Sep 17 00:00:00 2001 From: piyuwei Date: Tue, 30 Jan 2024 19:58:54 +0800 Subject: [PATCH 03/11] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E6=8A=8AHCCS=E9=85=8D?= =?UTF-8?q?=E7=BD=AE=E9=94=99=E8=AF=AF=E4=BB=8EL6=E5=8F=98=E6=9B=B4?= =?UTF-8?q?=E4=B8=BAL1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build/faultCode.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build/faultCode.json b/build/faultCode.json index 4c2e31e1..bd114755 100644 --- a/build/faultCode.json +++ b/build/faultCode.json @@ -8,7 +8,7 @@ "81938006","81938004","81478006","81478004","813B8006","813B8004","81578006","81578004","81958006","81958004", "81078603","8C2FA009","A4025021","A60250C1","A4025081","A214000D","A414000D","A4028801","A4025101","A2140007", "A4140007","A2140008","A4140008","A40250E1","A214000A","A414000A","A4025061","A4025041","A214000B","A414000B", - "A414000C","A2140009","A4140009","A4303002" + "A414000C","A2140009","A4140009","A4303002","819B8003" ], "RestartRequestCodes":[ "80C98008","80C98002","80C98003","80C98009","80CB8002","80CB8008","80CB8009","80CF8003","81318008","80D58000", @@ -41,7 +41,7 @@ "80E3A201","80E18402","80E0020B","81978002","815F8002","81338002","817F8002","816F8002","814F8002","81938002", "81478002","813B8002","81578002","81958002","9419321B","A2301000","A2301001","A2302001","A4192C1A","A4193216", "A419321B","A419321C","A42F390F","A42F3916","A42F391A","A6183207","A62F3934","A8028801","A819320F","A8193234", - "A8193235","819B8003" + "A8193235" ], "PreSeparateNPUCodes":[] } \ No newline at end of file -- Gitee From 8a80f45c223a77e68c43c0bf0a2d3094ea99feac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AE=8B=E7=9B=B8=E5=85=B5?= Date: Thu, 1 Feb 2024 03:24:00 +0000 Subject: [PATCH 04/11] =?UTF-8?q?!238=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E8=AF=B4=E6=98=8E=20Modification=E3=80=91=E6=9B=B4=E6=96=B0ann?= =?UTF-8?q?otation=E5=A4=B1=E8=B4=A5=E5=90=8E=E5=B0=9D=E8=AF=95patch?= =?UTF-8?q?=E6=9B=B4=E6=96=B0--5.0.0=20*=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E8=AF=B4=E6=98=8E=20Modification=E3=80=91=E6=9B=B4=E6=96=B0ann?= =?UTF-8?q?otation=E5=A4=B1=E8=B4=A5=E5=90=8E=E5=B0=9D=E8=AF=95patch?= =?UTF-8?q?=E6=9B=B4=E6=96=B0--5.0.0=20*=20add?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build/ascendplugin-310-volcano.yaml | 2 +- build/ascendplugin-310.yaml | 2 +- build/ascendplugin-310P-1usoc-volcano.yaml | 2 +- build/ascendplugin-310P-1usoc.yaml | 2 +- build/ascendplugin-310P-volcano.yaml | 2 +- build/ascendplugin-310P.yaml | 2 +- build/ascendplugin-910.yaml | 2 +- build/ascendplugin-volcano.yaml | 2 +- pkg/common/constants.go | 2 + pkg/kubeclient/client_server.go | 47 ++++++++++++++++++---- pkg/kubeclient/client_server_test.go | 4 +- pkg/kubeclient/kubeclient.go | 6 +++ 12 files changed, 58 insertions(+), 17 deletions(-) diff --git a/build/ascendplugin-310-volcano.yaml b/build/ascendplugin-310-volcano.yaml index c3555941..00de3b13 100644 --- a/build/ascendplugin-310-volcano.yaml +++ b/build/ascendplugin-310-volcano.yaml @@ -11,7 +11,7 @@ metadata: rules: - apiGroups: [""] resources: ["pods"] - verbs: ["get", "list", "update", "watch"] + verbs: ["get", "list", "update", "watch", "patch"] - apiGroups: [""] resources: ["nodes"] verbs: ["get", "patch"] diff --git a/build/ascendplugin-310.yaml b/build/ascendplugin-310.yaml index 4befaf14..2d2c71ee 100644 --- a/build/ascendplugin-310.yaml +++ b/build/ascendplugin-310.yaml @@ -11,7 +11,7 @@ metadata: rules: - apiGroups: [""] resources: ["pods"] - verbs: ["get", "list", "update", "watch"] + verbs: ["get", "list", "update", "watch", "patch"] - apiGroups: [""] resources: ["nodes"] verbs: ["get", "patch"] diff --git a/build/ascendplugin-310P-1usoc-volcano.yaml b/build/ascendplugin-310P-1usoc-volcano.yaml index 77651f68..46a3c575 100644 --- a/build/ascendplugin-310P-1usoc-volcano.yaml +++ b/build/ascendplugin-310P-1usoc-volcano.yaml @@ -11,7 +11,7 @@ metadata: rules: - apiGroups: [""] resources: ["pods"] - verbs: ["get", "list", "update", "watch"] + verbs: ["get", "list", "update", "watch", "patch"] - apiGroups: [""] resources: ["nodes"] verbs: ["get", "patch"] diff --git a/build/ascendplugin-310P-1usoc.yaml b/build/ascendplugin-310P-1usoc.yaml index 77651f68..46a3c575 100644 --- a/build/ascendplugin-310P-1usoc.yaml +++ b/build/ascendplugin-310P-1usoc.yaml @@ -11,7 +11,7 @@ metadata: rules: - apiGroups: [""] resources: ["pods"] - verbs: ["get", "list", "update", "watch"] + verbs: ["get", "list", "update", "watch", "patch"] - apiGroups: [""] resources: ["nodes"] verbs: ["get", "patch"] diff --git a/build/ascendplugin-310P-volcano.yaml b/build/ascendplugin-310P-volcano.yaml index 5f9371f0..3bd99537 100644 --- a/build/ascendplugin-310P-volcano.yaml +++ b/build/ascendplugin-310P-volcano.yaml @@ -11,7 +11,7 @@ metadata: rules: - apiGroups: [""] resources: ["pods"] - verbs: ["get", "list", "update", "watch"] + verbs: ["get", "list", "update", "watch", "patch"] - apiGroups: [""] resources: ["nodes"] verbs: ["get", "patch"] diff --git a/build/ascendplugin-310P.yaml b/build/ascendplugin-310P.yaml index c68e6fd4..58db0bec 100644 --- a/build/ascendplugin-310P.yaml +++ b/build/ascendplugin-310P.yaml @@ -11,7 +11,7 @@ metadata: rules: - apiGroups: [""] resources: ["pods"] - verbs: ["get", "list", "update", "watch"] + verbs: ["get", "list", "update", "watch", "patch"] - apiGroups: [""] resources: ["nodes"] verbs: ["get", "patch"] diff --git a/build/ascendplugin-910.yaml b/build/ascendplugin-910.yaml index 44bcb577..e527326a 100644 --- a/build/ascendplugin-910.yaml +++ b/build/ascendplugin-910.yaml @@ -11,7 +11,7 @@ metadata: rules: - apiGroups: [""] resources: ["pods"] - verbs: ["get", "list", "update", "watch"] + verbs: ["get", "list", "update", "watch", "patch"] - apiGroups: [""] resources: ["nodes"] verbs: ["get"] diff --git a/build/ascendplugin-volcano.yaml b/build/ascendplugin-volcano.yaml index baf3a27d..635901e2 100644 --- a/build/ascendplugin-volcano.yaml +++ b/build/ascendplugin-volcano.yaml @@ -11,7 +11,7 @@ metadata: rules: - apiGroups: [""] resources: ["pods"] - verbs: ["get", "list", "update", "watch"] + verbs: ["get", "list", "update", "watch", "patch"] - apiGroups: [""] resources: ["nodes"] verbs: ["get"] diff --git a/pkg/common/constants.go b/pkg/common/constants.go index eebc175c..ac159e7f 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -82,6 +82,8 @@ const ( Pod910DeviceKey = "ascend.kubectl.kubernetes.io/ascend-910-configuration" // MetaDataAnnotation downward api which map annotation from volcano to container's env MetaDataAnnotation = "metadata.annotations" + // MetaData is meta data of pod + MetaData = "metadata" // PodResourceSeverKey for pod resource key PodResourceSeverKey = "podResource" diff --git a/pkg/kubeclient/client_server.go b/pkg/kubeclient/client_server.go index 2d59168d..0f4a056b 100644 --- a/pkg/kubeclient/client_server.go +++ b/pkg/kubeclient/client_server.go @@ -16,6 +16,7 @@ package kubeclient import ( + "encoding/json" "fmt" "net" "strconv" @@ -31,36 +32,68 @@ import ( "Ascend-device-plugin/pkg/common" ) +// Similar to the K8s metadata structure +type metaData struct { + Annotations map[string]string `json:"annotations"` +} + +type podMetaData map[string]metaData + var tryUpdatePodWaitTime = 200 * time.Millisecond -var deviceInfoFlushTime = int64(60 * 60) -// TryUpdatePodAnnotation is to try updating pod annotation -func (ki *ClientK8s) TryUpdatePodAnnotation(pod *v1.Pod, annotation map[string]string) error { +// PatchPodAnnotation update pod annotation by patch +func (ki *ClientK8s) PatchPodAnnotation(pod *v1.Pod, annotation map[string]string) error { if pod == nil { - return fmt.Errorf("param pod is nil") + return fmt.Errorf("pod is nil") + } + newPodMetaData := podMetaData{common.MetaData: metaData{Annotations: annotation}} + podUpdateMetaData, err := json.Marshal(newPodMetaData) + if err != nil { + hwlog.RunLog.Errorf("failed to marshal the node status data, error is %v", err) + return err + } + if _, err = ki.PatchPod(pod, podUpdateMetaData); err != nil { + return err } + + return nil +} + +// TryUpdatePodAnnotation is to try updating pod annotation +func (ki *ClientK8s) TryUpdatePodAnnotation(pod *v1.Pod, annotation map[string]string) error { if annotation == nil { return fmt.Errorf("invalid annotation") } if pod.Annotations == nil { pod.Annotations = make(map[string]string, len(annotation)) } + + var err error for i := 0; i < common.RetryUpdateCount; i++ { if pod.Name != "" { for k, v := range annotation { pod.Annotations[k] = v } - _, err := ki.UpdatePod(pod) + _, err = ki.UpdatePod(pod) if err == nil { return nil } - hwlog.RunLog.Debugf("update pod annotation failed, times: %d, error is %v", i+1, err) + hwlog.RunLog.Warnf("update pod annotation by updating method failed, times: %d, error is %v", i+1, err) + if errors.IsConflict(err) { + hwlog.RunLog.Info("try to patch annotation") + err = ki.PatchPodAnnotation(pod, annotation) + if err == nil { + return nil + } + hwlog.RunLog.Errorf("patch annotation failed: %v", err) + } } time.Sleep(tryUpdatePodWaitTime) podNew := ki.GetPodCache(pod.Namespace, pod.Name) pod = &podNew } - return fmt.Errorf("update pod annotation failed, exceeded max number of retries") + + return fmt.Errorf("update pod annotation failed") } // TryUpdatePodCacheAnnotation is to try updating pod annotation in both api server and cache diff --git a/pkg/kubeclient/client_server_test.go b/pkg/kubeclient/client_server_test.go index 17ec68e9..49d79142 100644 --- a/pkg/kubeclient/client_server_test.go +++ b/pkg/kubeclient/client_server_test.go @@ -186,14 +186,14 @@ func TestTryUpdatePodAnnotation(t *testing.T) { mockGetPod := mockGetPodOpr(v1.Pod{}) defer mockGetPod.Reset() err := utKubeClient.TryUpdatePodAnnotation(testPod, getDeviceInfo(common.HuaweiAscend310P, npuChip310PPhyID0)) - convey.So(err.Error(), convey.ShouldEqual, "update pod annotation failed, exceeded max number of retries") + convey.So(err.Error(), convey.ShouldEqual, "update pod annotation failed") }) convey.Convey("try update pod annotation when get pod is not nil", t, func() { mockGetPod := mockGetPodOpr(*testPod) defer mockGetPod.Reset() err := utKubeClient.TryUpdatePodAnnotation(testPod, getDeviceInfo(common.HuaweiAscend310P, npuChip310PPhyID0)) - convey.So(err.Error(), convey.ShouldEqual, "update pod annotation failed, exceeded max number of retries") + convey.So(err.Error(), convey.ShouldEqual, "update pod annotation failed") }) } diff --git a/pkg/kubeclient/kubeclient.go b/pkg/kubeclient/kubeclient.go index becb10dc..59b10886 100644 --- a/pkg/kubeclient/kubeclient.go +++ b/pkg/kubeclient/kubeclient.go @@ -110,6 +110,12 @@ func (ki *ClientK8s) UpdatePod(pod *v1.Pod) (*v1.Pod, error) { return v1Pod, err } +// PatchPod patch pod information +func (ki *ClientK8s) PatchPod(pod *v1.Pod, data []byte) (*v1.Pod, error) { + return ki.Clientset.CoreV1().Pods(pod.Namespace).Patch(context.Background(), + pod.Name, types.StrategicMergePatchType, data, metav1.PatchOptions{}) +} + // GetActivePodList is to get active pod list func (ki *ClientK8s) GetActivePodList() ([]v1.Pod, error) { fieldSelector, err := fields.ParseSelector("spec.nodeName=" + ki.NodeName + "," + -- Gitee From 67406900a0411e4f78faef780ab7b0df61f46cf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AE=8B=E7=9B=B8=E5=85=B5?= Date: Wed, 28 Feb 2024 11:50:16 +0800 Subject: [PATCH 05/11] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=A2=9E=E5=8A=A0=E6=96=B0?= =?UTF-8?q?=E7=9A=84=E6=95=85=E9=9A=9C=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build/faultCode.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/faultCode.json b/build/faultCode.json index bd114755..6c80a77a 100644 --- a/build/faultCode.json +++ b/build/faultCode.json @@ -41,7 +41,7 @@ "80E3A201","80E18402","80E0020B","81978002","815F8002","81338002","817F8002","816F8002","814F8002","81938002", "81478002","813B8002","81578002","81958002","9419321B","A2301000","A2301001","A2302001","A4192C1A","A4193216", "A419321B","A419321C","A42F390F","A42F3916","A42F391A","A6183207","A62F3934","A8028801","A819320F","A8193234", - "A8193235" + "A8193235","80818c00" ], "PreSeparateNPUCodes":[] } \ No newline at end of file -- Gitee From accae79b897e512f09436d0995a772ff3f09eeea Mon Sep 17 00:00:00 2001 From: Light-Alex <245212467@qq.com> Date: Fri, 29 Mar 2024 03:48:27 +0000 Subject: [PATCH 06/11] =?UTF-8?q?!263=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E8=AF=B4=E6=98=8E=20Modification=E3=80=91kubernetes=201.25.13?= =?UTF-8?q?=E7=89=88=E6=9C=AC=E5=AD=98=E5=9C=A8=E9=AB=98=E5=8D=B1=E6=BC=8F?= =?UTF-8?q?=E6=B4=9E=EF=BC=8C=E5=8D=87=E7=BA=A7=E7=89=88=E6=9C=AC=E5=88=B0?= =?UTF-8?q?1.25.14=20*=20kubernetes=201.25.13=E7=89=88=E6=9C=AC=E5=AD=98?= =?UTF-8?q?=E5=9C=A8=E9=AB=98=E5=8D=B1=E6=BC=8F=E6=B4=9E=EF=BC=8C=E5=8D=87?= =?UTF-8?q?=E7=BA=A7=E7=89=88=E6=9C=AC=E5=88=B01.25.14?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index f8717d7f..61d45a7f 100644 --- a/go.mod +++ b/go.mod @@ -13,7 +13,7 @@ require ( k8s.io/client-go v0.25.13 k8s.io/component-helpers v0.25.13 k8s.io/kubelet v0.25.13 - k8s.io/kubernetes v1.25.13 + k8s.io/kubernetes v1.25.14 k8s.io/utils v0.0.0-20230209194617-a36077c30491 ) diff --git a/go.sum b/go.sum index 5e8489e8..bfae24a9 100644 --- a/go.sum +++ b/go.sum @@ -261,8 +261,8 @@ k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 h1:MQ8BAZPZlWk3S9K4a9NCkI k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1/go.mod h1:C/N6wCaBHeBHkHUesQOQy2/MZqGgMAFPqGsGQLdbZBU= k8s.io/kubelet v0.25.13 h1:MmUjqzfY/sSY0RLFynDPJaSwH2dqHLFrAypY7HZMrY0= k8s.io/kubelet v0.25.13/go.mod h1:cm2OAyE4cqOgd2FNJzktOd/2X4oTHlfqtMUOqJ0Okfg= -k8s.io/kubernetes v1.25.13 h1:3q5BQlaI1xkvj4/K50ZjjYwxRWF5ZRcyzuGaDYRiZO8= -k8s.io/kubernetes v1.25.13/go.mod h1:CjSm5tJyKxHyGhK5aAID68YMErzDhAjWLL7Nd7NMonU= +k8s.io/kubernetes v1.25.14 h1:pRsCw27F6te3LE9Gu1ADTS5/5mgUQREfcQajvDu/qOg= +k8s.io/kubernetes v1.25.14/go.mod h1:CjSm5tJyKxHyGhK5aAID68YMErzDhAjWLL7Nd7NMonU= k8s.io/utils v0.0.0-20230209194617-a36077c30491 h1:r0BAOLElQnnFhE/ApUsg3iHdVYYPBjNSSOMowRZxxsY= k8s.io/utils v0.0.0-20230209194617-a36077c30491/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 h1:iXTIw73aPyC+oRdyqqvVJuloN1p0AC/kzH07hu3NE+k= -- Gitee From 7b7c18649439d3d4b9cda3ccbc8df898e6dfdf4f Mon Sep 17 00:00:00 2001 From: piyuwei Date: Wed, 17 Apr 2024 06:14:52 +0000 Subject: [PATCH 07/11] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E9=87=8D=E8=A6=81/?= =?UTF-8?q?=E7=B4=A7=E6=80=A5=E5=91=8A=E8=AD=A6=E5=9C=A8=E6=9C=AA=E9=85=8D?= =?UTF-8?q?=E7=BD=AE=E7=9A=84=E6=83=85=E5=86=B5=E4=B8=8B=E4=B8=8A=E6=8A=A5?= =?UTF-8?q?=E4=B8=BAL1=E8=80=8C=E9=9D=9EL6=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: piyuwei --- pkg/common/fault_code.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/common/fault_code.go b/pkg/common/fault_code.go index 8d90891e..3c44cac0 100644 --- a/pkg/common/fault_code.go +++ b/pkg/common/fault_code.go @@ -635,7 +635,7 @@ func SaveDevFaultInfo(devFaultInfo common.DevFaultInfo) { if devFaultInfo.EventID == 0 { return } - faultSeverityMap[devFaultInfo.EventID] = devFaultInfo.Assertion + faultSeverityMap[devFaultInfo.EventID] = devFaultInfo.Severity devFaultInfoMapLock.Lock() devFaultInfoMap[devFaultInfo.LogicID] = append(devFaultInfoMap[devFaultInfo.LogicID], devFaultInfo) devFaultInfoMapLock.Unlock() -- Gitee From a8cd10e8152cf26b73f35de2d3ade413c2c8fc3d Mon Sep 17 00:00:00 2001 From: Light-Alex <245212467@qq.com> Date: Thu, 18 Apr 2024 12:34:11 +0000 Subject: [PATCH 08/11] =?UTF-8?q?!276=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E8=AF=B4=E6=98=8E=20Modification=E3=80=91=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?=E9=9D=9E0=E5=8D=A1=E6=95=85=E9=9A=9C=E7=83=AD=E5=A4=8D?= =?UTF-8?q?=E4=BD=8D=E5=90=8E=E7=BD=91=E7=BB=9C=E9=97=AE=E9=A2=98=EF=BC=8C?= =?UTF-8?q?=E5=9B=9E=E5=90=885.0.0=E5=88=86=E6=94=AF=20*=20=E4=BF=AE?= =?UTF-8?q?=E5=A4=8D=E9=9D=9E0=E5=8D=A1=E6=95=85=E9=9A=9C=E7=83=AD?= =?UTF-8?q?=E5=A4=8D=E4=BD=8D=E5=90=8E=E7=BD=91=E7=BB=9C=E9=97=AE=E9=A2=98?= =?UTF-8?q?=EF=BC=8C=E5=9B=9E=E5=90=885.0.0=E5=88=86=E6=94=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build/faultCustomization.json | 2 +- pkg/common/constants.go | 18 +--- pkg/device/ascend910.go | 136 +++++++++++++++++----------- pkg/device/ascendtolerance.go | 33 +++++-- pkg/device/ascendtorlerance_test.go | 11 +-- 5 files changed, 118 insertions(+), 82 deletions(-) diff --git a/build/faultCustomization.json b/build/faultCustomization.json index fc097598..e9f74b11 100644 --- a/build/faultCustomization.json +++ b/build/faultCustomization.json @@ -1,7 +1,7 @@ { "GraceTolerance": { "WaitFlushingCMTime": 90, - "WaitDeviceResetTime": 60 + "WaitDeviceResetTime": 120 }, "FaultFrequency": [ { diff --git a/pkg/common/constants.go b/pkg/common/constants.go index ac159e7f..84efd6a3 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -152,6 +152,8 @@ const ( const ( // ResourceNamePrefix prefix ResourceNamePrefix = "huawei.com/" + // DistributedJob annotation indicates that the job is distributed + DistributedJob = "distributed-job" // Ascend310P 310p Ascend310P = "Ascend310P" // Ascend310PV 310P-V @@ -552,24 +554,10 @@ const ( ) const ( - // LeftRingOf910 means id contains [0, 1, 2, 3] - LeftRingOf910 = 0 - // RightRingOf910 means id contains [4, 5, 6, 7] - RightRingOf910 = 1 - // RingOf910B means id contains [0, 1, 2, 3, 4, 5, 6, 7] - RingOf910B = 2 // MaxResetWaitRecoverTime max reset wait chip recover time is 150s MaxResetWaitRecoverTime = 150 ) -// LogicID list for reset, get id list of ring -const ( - LogicID0 = 0 - LogicID3 = 3 - LogicID4 = 4 - LogicID7 = 7 -) - // Fault customization const const ( // PollFaultCodeCMInterval is the default interval(second) of polling fault code CM @@ -595,7 +583,7 @@ const ( // MinWaitFlushCMTime for min time waiting for cm info to flush in container MinWaitFlushCMTime = 90 // DefaultWaitDeviceResetTime is the default time used in waiting device reset - DefaultWaitDeviceResetTime = 60 + DefaultWaitDeviceResetTime = 120 // MaxWaitDeviceResetTime is the max time used in waiting device reset MaxWaitDeviceResetTime = 120 // MinWaitDeviceResetTime is the min time used in waiting device reset diff --git a/pkg/device/ascend910.go b/pkg/device/ascend910.go index 4fb9fb34..5d1dd473 100644 --- a/pkg/device/ascend910.go +++ b/pkg/device/ascend910.go @@ -70,6 +70,7 @@ func (hnm *HwAscend910Manager) GetNPUs() (common.NpuAllInfo, error) { if err != nil { return common.NpuAllInfo{}, err } + hwlog.RunLog.Infof("The total number of devices at this node: %v", devNum) if devNum > hnm.devCount { return common.NpuAllInfo{}, fmt.Errorf("invalid device num: %d", devNum) } @@ -324,14 +325,12 @@ func (hnm *HwAscend910Manager) setTaskDevInfoCache() error { if hnm.isReSchedulingScene(len(devIdList)) { continue } - taskName, ok := pod.Annotations[common.ResetTaskNameKey] - if !ok { - taskName, ok = pod.Labels[common.ResetTaskNameKeyInLabel] - if !ok { - hwlog.RunLog.Error("failed to get task name by task key") - continue - } + + taskName := hnm.hotResetManager.GetTaskNameByPod(pod) + if taskName == "" { + continue } + rankIndex, ok := pod.Annotations[common.RankIndexKey] if common.ParamOption.RealCardType == common.Ascend910B && hnm.GetDeviceUsage() == common.Infer { rankIndex = common.InferRankIndex @@ -400,13 +399,10 @@ func (hnm *HwAscend910Manager) convertLogicIdToPhysicId(logicIds []int32) (physi } func (hnm *HwAscend910Manager) isReSchedulingScene(npuCount int) bool { - if common.ParamOption.RealCardType == common.Ascend910 && npuCount < common.Ascend910RingsNum { - return true - } - if common.ParamOption.RealCardType == common.Ascend910B && hnm.GetDeviceUsage() == common.Train && - npuCount < common.Ascend910BRingsNumTrain { + if hnm.GetDeviceUsage() == common.Train && npuCount < hnm.hotResetManager.GetRingNum() { return true } + return false } @@ -517,6 +513,8 @@ func (hnm *HwAscend910Manager) processAllTask() error { if resetFlag, err := hnm.isTaskInReset(taskName); err != nil || resetFlag { if resetFlag && !hnm.hotResetManager.IsCurNodeTaskInReset(taskName) && hnm.hotResetManager.IsExistFaultyDevInTask(taskName) { + hwlog.RunLog.Infof("task %s is in reset process which is not performed on current node. NPU"+ + " faults occurred on current node, the task's process policy will be marked as isolate", taskName) go hnm.tryWriteIsolationInfo(taskName) } continue @@ -763,12 +761,12 @@ func (hnm *HwAscend910Manager) resetProcess(taskName string, resetInfo *common.T // upgradeResetProcess upgrade the device reset processing to the device isolation processing func (hnm *HwAscend910Manager) upgradeResetProcess(taskName string, devFaultInfoList []*common.TaskDevInfo) error { - resultFaultInfoList, err := hnm.hotResetManager.GetNeedResetDevList(devFaultInfoList) + resetFaultInfoMap, err := hnm.hotResetManager.GetNeedResetDevMap(devFaultInfoList) if err != nil { hwlog.RunLog.Errorf("failed to get need reset device list, err: %v", err) return err } - if len(resultFaultInfoList) == 0 { + if len(resetFaultInfoMap) == 0 { return nil } if err := hnm.updateResetCMStatus(taskName, common.IsolateError, common.ResetError, common.RecoverFailedStatus, @@ -843,7 +841,7 @@ func (hnm *HwAscend910Manager) refreshDevFaultInfo(devFaultInfo []*common.TaskDe for _, devInfo := range devFaultInfo { _, errorCode, err := hnm.GetDmgr().GetDeviceAllErrorCode(devInfo.LogicId) if err != nil { - hwlog.RunLog.Errorf("failed to get device %d healthy", devInfo.LogicId) + hwlog.RunLog.Errorf("failed to get err code of device %d", devInfo.LogicId) return err } devInfo.Policy = hnm.hotResetManager.GetDevProcessPolicy(common.GetFaultType(errorCode, devInfo.LogicId)) @@ -853,12 +851,12 @@ func (hnm *HwAscend910Manager) refreshDevFaultInfo(devFaultInfo []*common.TaskDe } func (hnm *HwAscend910Manager) resetDeviceOnce(devFaultInfoList []*common.TaskDevInfo) error { - resetFaultInfoList, err := hnm.hotResetManager.GetNeedResetDevList(devFaultInfoList) + resetFaultInfoMap, err := hnm.hotResetManager.GetNeedResetDevMap(devFaultInfoList) if err != nil { hwlog.RunLog.Errorf("failed to get need reset device list, err: %v", err) return err } - if err := hnm.execResetDevice(resetFaultInfoList); err != nil { + if err := hnm.execResetDevice(resetFaultInfoMap); err != nil { hwlog.RunLog.Errorf("failed to exec reset device list, err: %v", err) return err } @@ -872,24 +870,25 @@ func (hnm *HwAscend910Manager) resetDeviceOnce(devFaultInfoList []*common.TaskDe return nil } -func (hnm *HwAscend910Manager) execResetDevice(devList map[int32]struct{}) error { - errList := make([]error, 0, len(devList)) - for devLogicId := range devList { - cardId, deviceId, err := hnm.GetDmgr().GetCardIDDeviceID(devLogicId) +func (hnm *HwAscend910Manager) execResetDevice(devMap map[int32]int32) error { + errList := make([]error, 0, len(devMap)) + for resetLogicId, faultLogicId := range devMap { + cardId, deviceId, err := hnm.GetDmgr().GetCardIDDeviceID(resetLogicId) if err != nil { hwlog.RunLog.Errorf("failed to get reset device card id and device id, err %v", err) return err } + shouldCheckNet := hnm.isShouldCheckNet(faultLogicId) if err := hnm.tryResetDevice(cardId, deviceId); err != nil { errList = append(errList, err) continue } // wait for the device to reset completely - if err := hnm.isRingResetComplete(devLogicId); err != nil { + if err := hnm.isRingResetComplete(resetLogicId, shouldCheckNet); err != nil { errList = append(errList, err) continue } - hwlog.RunLog.Infof("hot reset complete, cardId: %d, logicId: %d", cardId, devLogicId) + hwlog.RunLog.Infof("hot reset complete, cardId: %d, logicId: %d", cardId, resetLogicId) } if len(errList) == 0 { return nil @@ -897,12 +896,28 @@ func (hnm *HwAscend910Manager) execResetDevice(devList map[int32]struct{}) error return errList[0] } -func (hnm *HwAscend910Manager) waitDeviceResetComplete(logicId int32, totalTime *int) error { +func (hnm *HwAscend910Manager) isNetResetCompleted(logicId int32) bool { + netStatus, err := hnm.GetDmgr().GetDeviceNetWorkHealth(logicId) + if err != nil { + hwlog.RunLog.Warnf("get net status of %v error: %v", logicId, err) + return false + } + switch netStatus { + case networkDetectOK, networkDetectInit: + return true + default: + hwlog.RunLog.Warnf("%d network status is unhealthy, health code is %d", logicId, netStatus) + return false + } +} + +func (hnm *HwAscend910Manager) waitDeviceResetComplete(logicId int32, totalTime *int, shouldCheckNet bool) error { if err := wait.PollImmediate(time.Second, common.WaitDeviceResetTime*time.Second, func() (bool, error) { *totalTime += 1 if *totalTime > common.MaxResetWaitRecoverTime { return true, fmt.Errorf("wait device reset recover timeout") } + hwlog.RunLog.Infof("start to check card %d boot status", logicId) bootState, err := hnm.GetDmgr().GetDeviceBootStatus(logicId) if err != nil { hwlog.RunLog.Errorf("get device boot status failed, logic id: %d, err: %v", logicId, err) @@ -912,7 +927,13 @@ func (hnm *HwAscend910Manager) waitDeviceResetComplete(logicId int32, totalTime hwlog.RunLog.Debugf("device bootState(%d), starting...", bootState) return false, nil } - return true, nil + hwlog.RunLog.Infof("card %d start finish", logicId) + + if !shouldCheckNet { + return true, nil + } + + return hnm.isNetResetCompleted(logicId), nil }); err != nil { hwlog.RunLog.Errorf("hot reset failed, timeout or err: %v, logic id: %d", err, logicId) return err @@ -920,46 +941,48 @@ func (hnm *HwAscend910Manager) waitDeviceResetComplete(logicId int32, totalTime return nil } -func (hnm *HwAscend910Manager) isRingResetComplete(oriLogicID int32) error { - var ringType = -1 - switch common.ParamOption.RealCardType { - case common.Ascend910: - ringType = common.LeftRingOf910 - if oriLogicID >= common.Ascend910RingsNum { - ringType = common.RightRingOf910 - } - case common.Ascend910B: - ringType = common.RingOf910B +// isRunningDistributed returns true when volcano update 'distributed-job=true' to pod +func (hnm *HwAscend910Manager) isRunningDistributed(logicID int32) bool { + podMap, err := hnm.hotResetManager.GetFaultDev2PodMap() + if err != nil { + hwlog.RunLog.Errorf("failed to get pod while checking the task running mode of dev %v: %v", logicID, err) + return false + } + + pod, ok := podMap[logicID] + if !ok { + hwlog.RunLog.Errorf("no task running on device %v", logicID) + return false } - if ringType == -1 { - hwlog.RunLog.Error("device type cannot be identified") - return fmt.Errorf("device type cannot be identified") + + isDistributed, ok := pod.Annotations[common.DistributedJob] + if !ok { + return false } - // totalTime, statistic chip reset recover time + + return isDistributed == "true" +} + +func (hnm *HwAscend910Manager) isShouldCheckNet(logicID int32) bool { + // there is no need to check status of network, when running a single node task + return hnm.isRunningDistributed(logicID) +} + +func (hnm *HwAscend910Manager) isRingResetComplete(oriLogicID int32, shouldCheckNet bool) error { var totalTime int - leftRingID, rightRingID := hnm.getLogicIDOnRing(ringType) - for ringLogicID := leftRingID; ringLogicID <= rightRingID; ringLogicID++ { - if err := hnm.waitDeviceResetComplete(ringLogicID, &totalTime); err != nil { + resetStartLogicID := oriLogicID / int32(getChipCountOnRing()) * int32(getChipCountOnRing()) + for logicID := resetStartLogicID; logicID < resetStartLogicID+int32(getChipCountOnRing()); logicID++ { + if err := hnm.waitDeviceResetComplete(logicID, &totalTime, shouldCheckNet); err != nil { return err } } return nil } -func (hnm *HwAscend910Manager) getLogicIDOnRing(ringType int) (int32, int32) { - switch ringType { - case common.LeftRingOf910: - return common.LogicID0, common.LogicID3 - case common.RightRingOf910: - return common.LogicID4, common.LogicID7 - default: - return common.LogicID0, common.LogicID7 - } -} - func (hnm *HwAscend910Manager) tryResetDevice(cardId, deviceId int32) error { var realError error for i := 0; i < common.ResetRetryTimes; i++ { + hwlog.RunLog.Infof("start to execute cardId %d reset", cardId) err := hnm.GetDmgr().SetDeviceReset(cardId, deviceId) if err == nil { hwlog.RunLog.Infof("execute reset cardId %d success", cardId) @@ -979,7 +1002,12 @@ func (hnm *HwAscend910Manager) tryWriteIsolationInfo(taskName string) { hwlog.RunLog.Errorf("failed to get task device fault info list, err: %v", err) return } - if err := hnm.updateResetCMStatus(taskName, common.IsolateError, common.IsolateError, common.RecoverFailedStatus, + initPolicy, _, err := hnm.hotResetManager.GetTaskProcessPolicy(taskName) + if err != nil { + hwlog.RunLog.Errorf("failed to get task %s process policy, err: %v", taskName, err) + return + } + if err := hnm.updateResetCMStatus(taskName, common.IsolateError, initPolicy, common.RecoverFailedStatus, devFaultInfoList); err != nil { hwlog.RunLog.Errorf("failed to update reset cm to isolate, err: %v", err) } diff --git a/pkg/device/ascendtolerance.go b/pkg/device/ascendtolerance.go index 8f6ee230..67639edc 100644 --- a/pkg/device/ascendtolerance.go +++ b/pkg/device/ascendtolerance.go @@ -38,10 +38,11 @@ type HotResetManager interface { GetTaskProcessPolicy(string) (string, int, error) GetDevListInReset() map[int32]struct{} GetDevListByPolicyLevel([]*common.TaskDevInfo, int) (map[int32]struct{}, error) - GetNeedResetDevList([]*common.TaskDevInfo) (map[int32]struct{}, error) + GetNeedResetDevMap([]*common.TaskDevInfo) (map[int32]int32, error) GetTaskResetInfo([]*common.TaskDevInfo, string, string, string) (*common.TaskResetInfo, error) GetTaskFaultRankInfo([]*common.TaskDevInfo) (*common.TaskFaultInfo, error) GetFaultDev2PodMap() (map[int32]v1.Pod, error) + GetTaskNameByPod(pod v1.Pod) string GenerateTaskDevFaultInfoList(devIdList []int32, rankIndex string) ([]*common.TaskDevInfo, error) UpdateFaultDev2PodMap([]int32, v1.Pod) error UpdateGlobalDevFaultInfoCache([]*common.NpuDevice) error @@ -224,8 +225,8 @@ func (hrt *HotResetTools) GetDevListByPolicyLevel(devFaultInfoList []*common.Tas } // GetNeedResetDevList return device logic id list to be reset -func (hrt *HotResetTools) GetNeedResetDevList(devFaultInfoList []*common.TaskDevInfo) (map[int32]struct{}, error) { - needResetDevList := make(map[int32]struct{}) +func (hrt *HotResetTools) GetNeedResetDevMap(devFaultInfoList []*common.TaskDevInfo) (map[int32]int32, error) { + needResetDevMap := make(map[int32]int32) for _, devFaultInfo := range devFaultInfoList { policyType, ok := hrt.processPolicyTable[devFaultInfo.Policy] if !ok { @@ -237,12 +238,12 @@ func (hrt *HotResetTools) GetNeedResetDevList(devFaultInfoList []*common.TaskDev if policyType == common.RestartErrorLevel || policyType == common.ResetErrorLevel || policyType == common.RestartRequestErrorLevel { resetIndex := devFaultInfo.LogicId / int32(hrt.GetRingNum()) - if _, ok := needResetDevList[devFaultInfo.LogicId]; !ok { - needResetDevList[resetIndex*int32(hrt.GetRingNum())] = struct{}{} + if _, ok := needResetDevMap[devFaultInfo.LogicId]; !ok { + needResetDevMap[resetIndex*int32(hrt.GetRingNum())] = devFaultInfo.LogicId } } } - return needResetDevList, nil + return needResetDevMap, nil } // GetTaskResetInfo return the detail reset info of task to process @@ -306,6 +307,7 @@ func (hrt *HotResetTools) GetTaskFaultRankInfo(devFaultInfoList []*common.TaskDe return taskFaultInfo, nil } +// GetFaultDev2PodMap return map which contains fault device and pod func (hrt *HotResetTools) GetFaultDev2PodMap() (map[int32]v1.Pod, error) { if hrt.faultDev2PodMap == nil { return nil, fmt.Errorf("no valid faultDev2PodMap here") @@ -313,6 +315,20 @@ func (hrt *HotResetTools) GetFaultDev2PodMap() (map[int32]v1.Pod, error) { return hrt.faultDev2PodMap, nil } +// GetTaskNameByPod get task name which written by volcano or operator +func (hrt *HotResetTools) GetTaskNameByPod(pod v1.Pod) string { + taskName, ok := pod.Annotations[common.ResetTaskNameKey] + if !ok { + taskName, ok = pod.Labels[common.ResetTaskNameKeyInLabel] + if !ok { + hwlog.RunLog.Error("failed to get task name by task key") + return "" + } + } + + return taskName +} + // GenerateTaskDevFaultInfoList generate device fault info list in a task by device logic id list and rank index func (hrt *HotResetTools) GenerateTaskDevFaultInfoList(devIdList []int32, rankIndex string) ([]*common.TaskDevInfo, error) { @@ -360,6 +376,11 @@ func (hrt *HotResetTools) UpdateFaultDev2PodMap(devList []int32, pod v1.Pod) err continue } + // do not delete cache after receive recover event because device is resetting now + if _, ok := hrt.resetDev[device]; ok { + continue + } + // delete when device is healthy if _, ok := hrt.faultDev2PodMap[device]; ok { delete(hrt.faultDev2PodMap, device) diff --git a/pkg/device/ascendtorlerance_test.go b/pkg/device/ascendtorlerance_test.go index aef258a0..b419b854 100644 --- a/pkg/device/ascendtorlerance_test.go +++ b/pkg/device/ascendtorlerance_test.go @@ -323,15 +323,15 @@ func TestDevListByPolicyLevel(t *testing.T) { // TestGetNeedResetDevList for test get the needed be reseted device list func TestGetNeedResetDevList(t *testing.T) { - convey.Convey("test GetNeedResetDevList", t, func() { - convey.Convey("test GetNeedResetDevList success", func() { + convey.Convey("test GetNeedResetDevMap", t, func() { + convey.Convey("test GetNeedResetDevMap success", func() { tool := &HotResetTools{ allTaskDevFaultInfo: map[string][]*common.TaskDevInfo{"test": mockTaskDevInfoList()}, processPolicyTable: mockProcessPolicyTable(), } devFaultInfoList, ok := tool.allTaskDevFaultInfo["test"] convey.So(ok, convey.ShouldBeTrue) - devList, err := tool.GetNeedResetDevList(devFaultInfoList) + devList, err := tool.GetNeedResetDevMap(devFaultInfoList) convey.So(err, convey.ShouldBeNil) needResetDev, ok := devList[0] convey.So(needResetDev, convey.ShouldNotBeNil) @@ -339,14 +339,14 @@ func TestGetNeedResetDevList(t *testing.T) { _, ok = devList[1] convey.So(ok, convey.ShouldBeFalse) }) - convey.Convey("test GetNeedResetDevList failed", func() { + convey.Convey("test GetNeedResetDevMap failed", func() { tool := &HotResetTools{ allTaskDevFaultInfo: map[string][]*common.TaskDevInfo{"test": mockWrongTaskDevInfoList()}, processPolicyTable: mockProcessPolicyTable(), } devFaultInfoList, ok := tool.allTaskDevFaultInfo["test"] convey.So(ok, convey.ShouldBeTrue) - devList, err := tool.GetNeedResetDevList(devFaultInfoList) + devList, err := tool.GetNeedResetDevMap(devFaultInfoList) convey.So(devList, convey.ShouldBeNil) convey.So(err, convey.ShouldNotBeNil) }) @@ -820,7 +820,6 @@ func TestDeepCopyFunc(t *testing.T) { }) } -// func deepTestDevInfo(devInfo, devInfoTest *common.TaskDevInfo) { convey.So(devInfoTest, convey.ShouldNotBeNil) convey.So(devInfo, convey.ShouldNotBeNil) -- Gitee From 6620d3d5acdd63c14ea441c7d11041d294faa68a Mon Sep 17 00:00:00 2001 From: yanchuanxiang <245212467@qq.com> Date: Fri, 19 Apr 2024 09:52:32 +0800 Subject: [PATCH 09/11] =?UTF-8?q?=E6=94=AF=E6=8C=81=E6=84=9F=E7=9F=A5PCIE?= =?UTF-8?q?=E6=96=AD=E9=93=BE=E5=AF=BC=E8=87=B4=E7=9A=84=E6=8E=89=E5=8D=A1?= =?UTF-8?q?=E5=BC=82=E5=B8=B8=EF=BC=8C=E5=8F=8D=E5=90=885.0.0=E5=88=86?= =?UTF-8?q?=E6=94=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- go.mod | 4 +-- go.sum | 4 +-- pkg/common/fault_code.go | 7 +++++ pkg/common/proto.go | 1 + pkg/device/ascend910.go | 8 +++--- pkg/device/ascendcommon.go | 2 +- pkg/server/manager.go | 52 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 70 insertions(+), 8 deletions(-) diff --git a/go.mod b/go.mod index 61d45a7f..af6c6517 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ require ( github.com/fsnotify/fsnotify v1.6.0 github.com/smartystreets/goconvey v1.7.2 google.golang.org/grpc v1.57.2 - huawei.com/npu-exporter/v5 v5.0.0 + huawei.com/npu-exporter/v5 v5.0.1-B020 k8s.io/api v0.25.13 k8s.io/apimachinery v0.25.13 k8s.io/client-go v0.25.13 @@ -75,7 +75,7 @@ require ( replace ( github.com/containernetworking/cni => github.com/containernetworking/cni v0.8.1 github.com/gorilla/websocket => github.com/gorilla/websocket v1.4.2 - huawei.com/npu-exporter/v5 => gitee.com/ascend/ascend-npu-exporter/v5 v5.0.0 + huawei.com/npu-exporter/v5 => gitee.com/ascend/ascend-npu-exporter/v5 v5.0.1-B020 k8s.io/api => k8s.io/api v0.25.13 k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.25.13 k8s.io/apimachinery => k8s.io/apimachinery v0.25.13 diff --git a/go.sum b/go.sum index bfae24a9..ef7fa1e8 100644 --- a/go.sum +++ b/go.sum @@ -1,6 +1,6 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= -gitee.com/ascend/ascend-npu-exporter/v5 v5.0.0 h1:hA93VQJXkYbkLjwNlIwL0C6krpIhsQSqIJ3+l+fiD9Q= -gitee.com/ascend/ascend-npu-exporter/v5 v5.0.0/go.mod h1:uAjPsLJAJ89j/+yMXwoiEhQ037443MoTfG1afXXXdHc= +gitee.com/ascend/ascend-npu-exporter/v5 v5.0.1-B020 h1:VKTQRxkzi2FCrWpeZDBvX2gPYS0FkaSHaEw0MyGjDY4= +gitee.com/ascend/ascend-npu-exporter/v5 v5.0.1-B020/go.mod h1:uAjPsLJAJ89j/+yMXwoiEhQ037443MoTfG1afXXXdHc= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/Microsoft/go-winio v0.4.17 h1:iT12IBVClFevaf8PuVyi3UmZOVh4OqnaLxDTW2O6j3w= github.com/Microsoft/go-winio v0.4.17/go.mod h1:JPGBdM1cNvN/6ISo+n8V5iA4v8pBzdOpzfwIujj1a84= diff --git a/pkg/common/fault_code.go b/pkg/common/fault_code.go index 3c44cac0..a76a52d1 100644 --- a/pkg/common/fault_code.go +++ b/pkg/common/fault_code.go @@ -61,6 +61,8 @@ const ( // LinkDownFaultCodeStr linkdown fault code string LinkDownFaultCodeStr = "81078603" + // CardDropFaultCode card drop fault code + CardDropFaultCode = 0x40F84E00 faultCodeFilePath = "/usr/local/faultCode.json" ) @@ -978,3 +980,8 @@ func LinkDownTimeoutCheck(device *NpuDevice) { device.NetworkHealth = device.NetworkRealHealth } + +// CheckErrorMessage check whether the error message contains a specific string +func CheckErrorMessage(err error, target string) bool { + return err != nil && strings.Contains(err.Error(), target) +} diff --git a/pkg/common/proto.go b/pkg/common/proto.go index 85479fd2..cf633bd3 100644 --- a/pkg/common/proto.go +++ b/pkg/common/proto.go @@ -64,6 +64,7 @@ type NpuDevice struct { Health string NetworkRealHealth string NetworkHealth string + CardDrop bool IP string LogicID int32 PhyID int32 diff --git a/pkg/device/ascend910.go b/pkg/device/ascend910.go index 5d1dd473..8c474522 100644 --- a/pkg/device/ascend910.go +++ b/pkg/device/ascend910.go @@ -136,11 +136,13 @@ func (hnm *HwAscend910Manager) DoWithVolcanoListAndWatch(classifyDevs map[string } } -func (tool *AscendTools) getDeviceNetworkState(logicID int32) string { +func (tool *AscendTools) getDeviceNetworkState(logicID int32, initStatus string) string { healthCode, err := tool.dmgr.GetDeviceNetWorkHealth(logicID) if err != nil { - hwlog.RunLog.Warnf("get logicID %d network health failed, error code is %d", logicID, healthCode) - return v1beta1.Unhealthy + hwlog.RunLog.Warnf("get logicID %d network health status failed, network health code is %d, "+ + "network health status will not change", + logicID, healthCode) + return initStatus } switch healthCode { diff --git a/pkg/device/ascendcommon.go b/pkg/device/ascendcommon.go index 8db3c21b..3ef6a281 100644 --- a/pkg/device/ascendcommon.go +++ b/pkg/device/ascendcommon.go @@ -949,7 +949,7 @@ func (tool *AscendTools) handleDeviceNetworkFault(device *common.NpuDevice, common.GetLinkdownLinkupFaultEvents(device.LogicID, devFaultInfoMap[device.LogicID]) if common.UseGetDeviceNetWorkHealthApi { - deviceNetworkHealth := tool.getDeviceNetworkState(device.LogicID) + deviceNetworkHealth := tool.getDeviceNetworkState(device.LogicID, device.NetworkHealth) common.GetCurrentDeviceNetWorkHealth(device.LogicID, deviceNetworkHealth) } diff --git a/pkg/server/manager.go b/pkg/server/manager.go index 05306d12..a623ea32 100644 --- a/pkg/server/manager.go +++ b/pkg/server/manager.go @@ -305,6 +305,9 @@ func (hdm *HwDevManager) ListenDevice(ctx context.Context) { common.UnlockAllDeviceInfo() continue } + // complete the fault codes that cannot be reported by the event subscribe interface + hdm.mendSubscribeFaultEvent() + hdm.notifyToK8s(&initTime) hdm.useVolcanoNotify() hdm.chipHotReset() @@ -901,3 +904,52 @@ func getFaultCodeCMPollInterval(configMap *v1.ConfigMap) int { } return interval } + +func (hdm *HwDevManager) mendSubscribeFaultEvent() { + hdm.handleDropCardFault() +} + +func (hdm *HwDevManager) handleDropCardFault() { + if common.SubscribeFailed { + return + } + + for _, npuDevices := range hdm.groupDevice { + for _, npuDevice := range npuDevices { + hdm.generateCardDropFaultEvent(npuDevice) + } + } +} + +func (hdm *HwDevManager) generateCardDropFaultEvent(npuDevice *common.NpuDevice) { + if !npuDevice.CardDrop && hdm.checkCardDropFault(npuDevice.LogicID) { + faultInfo := npuCommon.DevFaultInfo{ + EventID: common.CardDropFaultCode, + LogicID: npuDevice.LogicID, + Assertion: npuCommon.FaultOccur, + } + npuDevice.CardDrop = true + common.SaveDevFaultInfo(faultInfo) + } + + if npuDevice.CardDrop && !hdm.checkCardDropFault(npuDevice.LogicID) { + faultInfo := npuCommon.DevFaultInfo{ + EventID: common.CardDropFaultCode, + LogicID: npuDevice.LogicID, + Assertion: npuCommon.FaultRecover, + } + npuDevice.CardDrop = false + common.SaveDevFaultInfo(faultInfo) + } +} + +func (hdm *HwDevManager) checkCardDropFault(logicID int32) bool { + _, err := hdm.manager.GetDmgr().GetDeviceHealth(logicID) + if common.CheckErrorMessage(err, npuCommon.DeviceNotReadyErrCodeStr) { + hwlog.RunLog.Errorf("logic id %d, error message contains %s, device does not ready, "+ + "the card may be dropped", logicID, npuCommon.DeviceNotReadyErrCodeStr) + return true + } + + return false +} -- Gitee From 69643159199b11b55dd0c771f315d5286866c6af Mon Sep 17 00:00:00 2001 From: chengjunhua Date: Thu, 13 Jun 2024 16:48:20 +0800 Subject: [PATCH 10/11] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=20=E6=94=AF=E6=8C=81=E5=8D=95?= =?UTF-8?q?=E6=AC=A1=E6=95=85=E9=9A=9C=E5=8D=87=E7=BA=A7=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/constants.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/common/constants.go b/pkg/common/constants.go index 84efd6a3..05a4b86f 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -595,7 +595,7 @@ const ( // MaxFaultFrequencyTimes is the max count for the fault occurrence time of fault frequency MaxFaultFrequencyTimes = 100 // MinFaultFrequencyTimes is the min count for the fault occurrence time of fault frequency - MinFaultFrequencyTimes = 2 + MinFaultFrequencyTimes = 1 // DefaultLinkUpTimeout is the default time for the linkup event DefaultLinkUpTimeout = 60 // MinLinkUpTimeout is the min time for the linkup event -- Gitee From 6b7ee8513553eea2686cf3ff74ad5d8998379705 Mon Sep 17 00:00:00 2001 From: chengjunhua Date: Wed, 21 Aug 2024 22:46:03 +0800 Subject: [PATCH 11/11] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=20=E6=94=AF=E6=8C=81=E5=A4=8D?= =?UTF-8?q?=E4=BD=8D=E6=97=B6=E4=B8=8A=E6=8A=A5=E5=A4=8D=E4=BD=8D=E7=8A=B6?= =?UTF-8?q?=E6=80=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/constants.go | 5 +++++ pkg/common/proto.go | 2 ++ pkg/device/ascend910.go | 1 + pkg/device/ascendcommon.go | 12 ++++++++++-- 4 files changed, 18 insertions(+), 2 deletions(-) diff --git a/pkg/common/constants.go b/pkg/common/constants.go index 40d5bce7..5f0ed8ec 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -753,3 +753,8 @@ const ( // Subscribe represents subscribe mode Subscribe = "subscribe" ) + +const ( + NPUNormalStatus = "normal" + NPUResettingStatus = "resetting" +) \ No newline at end of file diff --git a/pkg/common/proto.go b/pkg/common/proto.go index 3a9215d8..8a27b86e 100644 --- a/pkg/common/proto.go +++ b/pkg/common/proto.go @@ -80,6 +80,7 @@ type NpuDevice struct { LogicID int32 PhyID int32 CardID int32 + Status string } // DavinCiDev davinci device @@ -153,6 +154,7 @@ type DevStatusSet struct { UnHealthyDevice sets.String NetUnHealthyDevice sets.String HealthDevices sets.String + RecoveringDevices sets.String FreeHealthyDevice map[string]sets.String DeviceFault []DeviceFault } diff --git a/pkg/device/ascend910.go b/pkg/device/ascend910.go index 0f87f8a0..68823b17 100644 --- a/pkg/device/ascend910.go +++ b/pkg/device/ascend910.go @@ -216,6 +216,7 @@ func (hnm *HwAscend910Manager) setAllDevUnhealthyOnRing(classifyDevs map[string] for devIndex := startDevIndex; devIndex < endDevIndex; devIndex++ { devStatusList[devIndex].NetworkHealth = v1beta1.Unhealthy devStatusList[devIndex].Health = v1beta1.Unhealthy + devStatusList[devIndex].Status = common.NPUResettingStatus } return nil } diff --git a/pkg/device/ascendcommon.go b/pkg/device/ascendcommon.go index 2b1698e9..903272b6 100644 --- a/pkg/device/ascendcommon.go +++ b/pkg/device/ascendcommon.go @@ -388,7 +388,8 @@ func (tool *AscendTools) getRealUsedDevices() sets.String { func (tool *AscendTools) getDevStatesDevSet(classifyDevs map[string][]*common.NpuDevice) common.DevStatusSet { totalFreeDevices := make(map[string]sets.String, len(classifyDevs)) - totalUHDevices, totalNetUHDevices, allTypeUsedDevice := sets.String{}, sets.String{}, sets.String{} + totalUHDevices, totalNetUHDevices, allTypeUsedDevice, totalRCDevices := + sets.String{}, sets.String{}, sets.String{}, sets.String{} totalDeviceFaults := make([]common.DeviceFault, 0, common.GeneralMapSize) if !common.ParamOption.PresetVDevice { allTypeUsedDevice = tool.getRealUsedDevices() @@ -402,18 +403,21 @@ func (tool *AscendTools) getDevStatesDevSet(classifyDevs map[string][]*common.Np } totalUHDevices = totalUHDevices.Union(partDevStatusSet.UnHealthyDevice) totalNetUHDevices = totalNetUHDevices.Union(partDevStatusSet.NetUnHealthyDevice) + totalRCDevices = totalRCDevices.Union(partDevStatusSet.RecoveringDevices) totalDeviceFaults = append(totalDeviceFaults, partDevStatusSet.DeviceFault...) } return common.DevStatusSet{ FreeHealthyDevice: totalFreeDevices, UnHealthyDevice: totalUHDevices, NetUnHealthyDevice: totalNetUHDevices, + RecoveringDevices: totalRCDevices, DeviceFault: totalDeviceFaults, } } func (tool *AscendTools) groupDevsByStatus(subClassDevices []*common.NpuDevice, runMode string) common.DevStatusSet { - healthDevice, totalUHDevices, totalNetworkUHDevices := sets.String{}, sets.String{}, sets.String{} + healthDevice, totalUHDevices, totalNetworkUHDevices, totalRCDevices := + sets.String{}, sets.String{}, sets.String{}, sets.String{} deviceFaults := make([]common.DeviceFault, 0, common.GeneralMapSize) for _, device := range subClassDevices { if len(device.NetworkFaultCodes) != 0 || device.NetworkHealth == v1beta1.Unhealthy { @@ -439,6 +443,9 @@ func (tool *AscendTools) groupDevsByStatus(subClassDevices []*common.NpuDevice, if device.NetworkHealth == v1beta1.Unhealthy { totalNetworkUHDevices.Insert(device.DeviceName) } + if device.Status == common.NPUResettingStatus { + totalRCDevices.Insert(device.DeviceName) + } if device.Health == v1beta1.Healthy { healthDevice.Insert(device.DeviceName) continue @@ -458,6 +465,7 @@ func (tool *AscendTools) groupDevsByStatus(subClassDevices []*common.NpuDevice, HealthDevices: healthDevice, UnHealthyDevice: totalUHDevices, NetUnHealthyDevice: totalNetworkUHDevices, + RecoveringDevices: totalRCDevices, DeviceFault: deviceFaults, } } -- Gitee